Commit 1812666a, authored Nov 14, 2018 by Chao Liu
Parent: ff31af22

improved blockwise_tensor_op

Showing 5 changed files with 418 additions and 100 deletions
driver/conv.cu                             +3    -3
src/include/blockwise_tensor_op.cuh        +340  -37
src/include/device_tensor_descriptor.cuh   +0    -60
src/include/direct_convolution.cuh         +4    -0
src/include/threadwise_tensor_op.cuh       +71   -0
driver/conv.cu

@@ -26,7 +26,7 @@ struct GeneratorTensor
     T operator()(Is... is)
     {
 #if 1
-        return std::rand() / RAND_MAX;
+        return T(std::rand()) / T(RAND_MAX);
 #elif 0
         std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
...

@@ -142,8 +142,8 @@ void device_convolution(
     constexpr unsigned NBlockCopyLen0 = 1;
     constexpr unsigned NBlockCopyLen1 = 1;
-    constexpr unsigned NBlockCopyLen2 = 2;
-    constexpr unsigned NBlockCopyLen3 = 16;
+    constexpr unsigned NBlockCopyLen2 = 4;
+    constexpr unsigned NBlockCopyLen3 = 32;
     constexpr unsigned BlockSize      = 128;
...
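The first hunk fixes an integer-division bug in the random generator: std::rand() and RAND_MAX are both int, so std::rand() / RAND_MAX truncates to 0 for essentially every draw, while converting both operands to T first gives values in [0, 1]. A minimal standalone sketch of the difference (not part of the commit; the helper names below are made up for illustration):

#include <cstdlib>
#include <cstdio>

template <class T>
T old_gen() { return std::rand() / RAND_MAX; }        // integer division: almost always 0

template <class T>
T new_gen() { return T(std::rand()) / T(RAND_MAX); }  // floating-point division: in [0, 1]

int main()
{
    std::printf("old: %f  new: %f\n", old_gen<float>(), new_gen<float>());
    return 0;
}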
src/include/blockwise_tensor_op.cuh

#pragma once
#include "constant_tensor_descriptor.cuh"

#define BLOCKWISE_TENSOR_OP_METHOD 12

#if BLOCKWISE_TENSOR_OP_METHOD == 11
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          unsigned NWorkLen0,
          unsigned NWorkLen1,
          unsigned NWorkLen2,
          unsigned NWorkLen3,
          class F,
          unsigned BlockSize>
__device__ void blockwise_4d_tensor_op(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths());

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op: src_desc: ");
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op: dst_desc: ");
    }
#endif

    for(unsigned i = threadIdx.x; i < desc.GetElementSize(); i += BlockSize)
    {
        unsigned is = i;

        const unsigned did0 = is / desc.GetStride(I0);
        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);
        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);
        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

        f(p_src[sindex], p_dst[dindex]);
    }
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 12
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          ...

@@ -24,6 +86,8 @@ __device__ void blockwise_4d_tensor_op(

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths());

#if 0
    if(threadIdx.x == 0)
    {

@@ -32,13 +96,94 @@ __device__ void blockwise_4d_tensor_op(

    }
#endif

    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

    for(unsigned iloop = 0; iloop + 1 < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);
        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);
        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);
        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

        f(p_src[sindex], p_dst[dindex]);
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);
            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);
            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);
            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

            f(p_src[sindex], p_dst[dindex]);
        }
    }
}
#endif
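Methods 11 and 12 assign work by a flat per-thread element index and decompose it into the four coordinates (did0, did1, did2, did3) with the repeated divide/subtract against the descriptor strides shown above. A small host-side sketch of that decomposition, assuming a packed 4D layout (the lengths and the sample index below are illustrative, not taken from the commit):

#include <cstdio>

int main()
{
    const unsigned len[4]    = {2, 3, 4, 5};
    const unsigned stride[4] = {len[1] * len[2] * len[3], len[2] * len[3], len[3], 1};

    const unsigned i = 37;    // some flat index < 2*3*4*5

    unsigned is = i;
    const unsigned did0 = is / stride[0]; is -= did0 * stride[0];
    const unsigned did1 = is / stride[1]; is -= did1 * stride[1];
    const unsigned did2 = is / stride[2]; is -= did2 * stride[2];
    const unsigned did3 = is / stride[3];

    std::printf("%u -> (%u, %u, %u, %u)\n", i, did0, did1, did2, did3);
    return 0;
}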
#if BLOCKWISE_TENSOR_OP_METHOD == 21
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          unsigned NWorkLen0,
          unsigned NWorkLen1,
          unsigned NWorkLen2,
          unsigned NWorkLen3,
          class F,
          unsigned BlockSize>
__device__ void blockwise_4d_tensor_op(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr unsigned NWorkStride3 = 1;
    constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3;
    constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2;
    constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1;

    unsigned itmp = threadIdx.x;

    const unsigned did0_begin = itmp / NWorkStride0;

@@ -66,29 +211,99 @@ __device__ void blockwise_4d_tensor_op(

                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

-                   f(p_src[dindex], p_dst[sindex]);
+                   f(p_src[sindex], p_dst[dindex]);
                }

#if 0
                // if(threadIdx.x == 0)
                {
                    printf("blockwise_4d_tensor_op: 1: thread id %u, \t"
                           "sindex %u, p_src[sindex] %f, \t"
                           "dindex %u, p_dst[dindex] %f\n",
                           threadIdx.x,
                           sindex,
                           p_src[sindex],
                           dindex,
                           p_dst[dindex]);
                }
#endif
            }
        }
    }
}
#endif
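Methods 21 through 23 partition work differently: each thread derives a starting coordinate (did0_begin, ..., did3_begin) from threadIdx.x via the NWorkStride decomposition, then strides through the tensor by NWorkLen0..3 in each dimension. A host-side sketch of the thread-id to starting-coordinate mapping, using 1, 1, 4, 32 (the NBlockCopyLen values set in driver/conv.cu) purely as an example of the work lengths:

#include <cstdio>

int main()
{
    constexpr unsigned NWorkLen0 = 1, NWorkLen1 = 1, NWorkLen2 = 4, NWorkLen3 = 32;

    constexpr unsigned NWorkStride3 = 1;
    constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3;
    constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2;
    constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1;   // == 128 == BlockSize here

    const unsigned sample_tids[] = {0, 1, 37, 127};
    for(unsigned tid : sample_tids)
    {
        unsigned itmp = tid;
        const unsigned did0_begin = itmp / NWorkStride0; itmp -= did0_begin * NWorkStride0;
        const unsigned did1_begin = itmp / NWorkStride1; itmp -= did1_begin * NWorkStride1;
        const unsigned did2_begin = itmp / NWorkStride2; itmp -= did2_begin * NWorkStride2;
        const unsigned did3_begin = itmp / NWorkStride3;

        std::printf("thread %3u starts at (%u, %u, %u, %u)\n",
                    tid, did0_begin, did1_begin, did2_begin, did3_begin);
    }
    return 0;
}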
#if BLOCKWISE_TENSOR_OP_METHOD == 22
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          unsigned NWorkLen0,
          unsigned NWorkLen1,
          unsigned NWorkLen2,
          unsigned NWorkLen3,
          class F,
          unsigned BlockSize>
__device__ void blockwise_4d_tensor_op(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr unsigned NWorkStride3 = 1;
    constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3;
    constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2;
    constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1;

    unsigned itmp = threadIdx.x;

    const unsigned did0_begin = itmp / NWorkStride0;
    itmp -= did0_begin * NWorkStride0;

    const unsigned did1_begin = itmp / NWorkStride1;
    itmp -= did1_begin * NWorkStride1;

    const unsigned did2_begin = itmp / NWorkStride2;
    itmp -= did2_begin * NWorkStride2;

    const unsigned did3_begin = itmp / NWorkStride3;

    unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
    unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);

    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0)
    {
        const unsigned sindex_save0 = sindex;
        const unsigned dindex_save0 = dindex;

        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1)
        {
            const unsigned sindex_save1 = sindex;
            const unsigned dindex_save1 = dindex;

            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2)
            {
                const unsigned sindex_save2 = sindex;
                const unsigned dindex_save2 = dindex;

                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3)
                {
                    f(p_src[sindex], p_dst[dindex]);

                    sindex += NWorkLen3 * src_desc.GetStride(I3);
                    dindex += NWorkLen3 * dst_desc.GetStride(I3);
                }

                sindex = sindex_save2 + NWorkLen2 * src_desc.GetStride(I2);
                dindex = dindex_save2 + NWorkLen2 * dst_desc.GetStride(I2);
            }

            sindex = sindex_save1 + NWorkLen1 * src_desc.GetStride(I1);
            dindex = dindex_save1 + NWorkLen1 * dst_desc.GetStride(I1);
        }

        sindex = sindex_save0 + NWorkLen0 * src_desc.GetStride(I0);
        dindex = dindex_save0 + NWorkLen0 * dst_desc.GetStride(I0);
    }
}
#endif
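Method 22 avoids recomputing Get1dIndex inside the loops: the running source and destination offsets are saved when a loop level is entered and restored, advanced by one work stride, when it closes. A minimal host-side check of that save/restore bookkeeping against the direct offset formula, with made-up lengths and a packed layout (a single "thread" starting at the origin):

#include <cassert>
#include <cstdio>

int main()
{
    const unsigned len[4]    = {2, 3, 4, 5};
    const unsigned stride[4] = {60, 20, 5, 1};   // packed layout
    const unsigned work[4]   = {1, 1, 2, 2};     // per-thread step in each dimension

    unsigned index = 0;
    for(unsigned d0 = 0; d0 < len[0]; d0 += work[0])
    {
        const unsigned save0 = index;
        for(unsigned d1 = 0; d1 < len[1]; d1 += work[1])
        {
            const unsigned save1 = index;
            for(unsigned d2 = 0; d2 < len[2]; d2 += work[2])
            {
                const unsigned save2 = index;
                for(unsigned d3 = 0; d3 < len[3]; d3 += work[3])
                {
                    const unsigned ref =
                        d0 * stride[0] + d1 * stride[1] + d2 * stride[2] + d3 * stride[3];
                    assert(index == ref);          // incremental offset matches Get1dIndex
                    index += work[3] * stride[3];  // advance innermost dimension
                }
                index = save2 + work[2] * stride[2];  // restore, then advance dim 2
            }
            index = save1 + work[1] * stride[1];      // restore, then advance dim 1
        }
        index = save0 + work[0] * stride[0];          // restore, then advance dim 0
    }
    std::printf("incremental offsets matched the direct formula for every visited element\n");
    return 0;
}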
#if BLOCKWISE_TENSOR_OP_METHOD == 23
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          ...

@@ -111,41 +326,129 @@ __device__ void blockwise_4d_tensor_op(

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths());

    constexpr unsigned NWorkStride3 = 1;
    constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3;
    constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2;
    constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1;

    unsigned itmp = threadIdx.x;

    const unsigned did0_begin = itmp / NWorkStride0;
    itmp -= did0_begin * NWorkStride0;

    const unsigned did1_begin = itmp / NWorkStride1;
    itmp -= did1_begin * NWorkStride1;

    const unsigned did2_begin = itmp / NWorkStride2;
    itmp -= did2_begin * NWorkStride2;

    const unsigned did3_begin = itmp / NWorkStride3;

    unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
    unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);

    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0)
    {
        unsigned i1 = 0;

        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1)
        {
            unsigned i2 = 0;

            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2)
            {
                unsigned i3 = 0;

                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3)
                {
                    f(p_src[sindex], p_dst[dindex]);

                    sindex += NWorkLen3 * src_desc.GetStride(I3);
                    dindex += NWorkLen3 * dst_desc.GetStride(I3);

                    ++i3;
                }

                sindex += NWorkLen2 * src_desc.GetStride(I2) - i3 * NWorkLen3 * src_desc.GetStride(I3);
                dindex += NWorkLen2 * dst_desc.GetStride(I2) - i3 * NWorkLen3 * dst_desc.GetStride(I3);

                ++i2;
            }

            sindex += NWorkLen1 * src_desc.GetStride(I1) - i2 * NWorkLen2 * src_desc.GetStride(I2);
            dindex += NWorkLen1 * dst_desc.GetStride(I1) - i2 * NWorkLen2 * dst_desc.GetStride(I2);

            ++i1;
        }

        sindex += NWorkLen0 * src_desc.GetStride(I0) - i1 * NWorkLen1 * src_desc.GetStride(I1);
        dindex += NWorkLen0 * dst_desc.GetStride(I0) - i1 * NWorkLen1 * dst_desc.GetStride(I1);
    }
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 31
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          unsigned NWorkLen0,
          unsigned NWorkLen1,
          unsigned NWorkLen2,
          unsigned NWorkLen3,
          class F,
          unsigned BlockSize>
__device__ void blockwise_4d_tensor_op(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr unsigned NWorkStride3 = 1;
    constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3;
    constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2;
    constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1;

    unsigned itmp = threadIdx.x;

    const unsigned did0_begin = itmp / NWorkStride0;
    itmp -= did0_begin * NWorkStride0;

    const unsigned did1_begin = itmp / NWorkStride1;
    itmp -= did1_begin * NWorkStride1;

    const unsigned did2_begin = itmp / NWorkStride2;
    itmp -= did2_begin * NWorkStride2;

    const unsigned did3_begin = itmp / NWorkStride3;

    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0)
    {
        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1)
        {
            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2)
            {
                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3)
                {
                    const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

                    f(p_src[sindex], p_dst[dindex]);
                }
            }
        }
    }
}
#endif
src/include/device_tensor_descriptor.cuh (deleted, 100644 → 0)

#pragma once
#include <algorithm>
#include "constant_tensor_descriptor.cuh"
#include "helper_cuda.h"
#include "tensor.hpp"

template <unsigned NDim>
struct DeviceTensorDescriptor
{
    __host__ __device__ DeviceTensorDescriptor() = default;

    __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc)
    {
        assert(NDim == host_desc.GetDimension());

        std::copy(host_desc.GetLengths().begin(), host_desc.GetLengths().end(), mpLengths);
        std::copy(host_desc.GetStrides().begin(), host_desc.GetStrides().end(), mpStrides);
    }

    __host__ __device__ unsigned GetLength(unsigned i) const { return mpLengths[i]; }

    __host__ __device__ unsigned GetStride(unsigned i) const { return mpStrides[i]; }

    // this is ugly, only for 4d
    __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const
    {
        return n * mpStrides[0] + c * mpStrides[1] + h * mpStrides[2] + w * mpStrides[3];
    }

    unsigned mpLengths[NDim];
    unsigned mpStrides[NDim];
};

// this is ugly, only for 4d
template <class TConstTensorDesc>
__host__ __device__ auto make_DeviceTensorDescriptor(TConstTensorDesc)
{
    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto const_desc = TConstTensorDesc{};
    constexpr auto ndim       = const_desc.GetDimension();

    auto desc = DeviceTensorDescriptor<ndim>{};

    desc.mpLengths[0] = const_desc.GetLength(I0);
    desc.mpLengths[1] = const_desc.GetLength(I1);
    desc.mpLengths[2] = const_desc.GetLength(I2);
    desc.mpLengths[3] = const_desc.GetLength(I3);

    desc.mpStrides[0] = const_desc.GetStride(I0);
    desc.mpStrides[1] = const_desc.GetStride(I1);
    desc.mpStrides[2] = const_desc.GetStride(I2);
    desc.mpStrides[3] = const_desc.GetStride(I3);

    return desc;
}
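Get1dIndex in the deleted descriptor is the usual strided 4D offset, n * mpStrides[0] + c * mpStrides[1] + h * mpStrides[2] + w * mpStrides[3]. A tiny standalone check (sizes are illustrative, not from the commit) that for a packed NCHW layout these offsets enumerate the elements densely:

#include <cassert>
#include <cstdio>

int main()
{
    const unsigned N = 2, C = 3, H = 4, W = 5;
    const unsigned stride[4] = {C * H * W, H * W, W, 1};   // packed NCHW strides

    unsigned expected = 0;
    for(unsigned n = 0; n < N; ++n)
        for(unsigned c = 0; c < C; ++c)
            for(unsigned h = 0; h < H; ++h)
                for(unsigned w = 0; w < W; ++w)
                {
                    const unsigned offset =
                        n * stride[0] + c * stride[1] + h * stride[2] + w * stride[3];
                    assert(offset == expected++);   // dense, in-order offsets
                }

    std::printf("strided offsets are dense for the packed layout\n");
    return 0;
}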
src/include/direct_convolution.cuh

@@ -283,6 +283,7 @@ __global__ void gridwise_convolution(InDesc,
     {
         auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };

+#if 1
         // copy input tensor to LDS
         blockwise_4d_tensor_op<TFloat,
                                decltype(in_block_glb_desc),
...
@@ -300,7 +301,9 @@ __global__ void gridwise_convolution(InDesc,
                                in_block_lds_desc,
                                p_in_block,
                                f_copy);
+#endif

+#if 1
         // copy weight tensor to LDS
         blockwise_4d_tensor_op<TFloat,
                                decltype(wei_block_glb_desc),
...
@@ -316,6 +319,7 @@ __global__ void gridwise_convolution(InDesc,
                                wei_block_lds_desc,
                                p_wei_block,
                                f_copy);
+#endif

         // copy output tensor to LDS
         blockwise_4d_tensor_op<TFloat,
...
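The added #if 1 / #endif guards make each copy to LDS easy to disable in isolation; the copies themselves are blockwise_4d_tensor_op instantiations driven by the f_copy functor, so the same traversal code can copy, accumulate, or scale elements depending on the functor passed in. A reduced host-side sketch of that functor-driven pattern (the elementwise helper below is illustrative only, not the kernel's API):

#include <cstdio>

template <class T, class F>
void elementwise(const T* src, T* dst, unsigned n, F f)
{
    // apply the element functor to every (src, dst) pair
    for(unsigned i = 0; i < n; ++i)
        f(src[i], dst[i]);
}

int main()
{
    float a[4] = {1, 2, 3, 4};
    float b[4] = {0, 0, 0, 0};

    auto f_copy = [](const float& src, float& dst) { dst = src; };
    elementwise(a, b, 4, f_copy);

    std::printf("%f %f %f %f\n", b[0], b[1], b[2], b[3]);
    return 0;
}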
src/include/threadwise_tensor_op.cuh

#pragma once
#include "constant_tensor_descriptor.cuh"

#define THREADWISE_TENSOR_OP_METHOD 1

#if THREADWISE_TENSOR_OP_METHOD == 0
template <class TFloat, class SrcDesc, class DstDesc, class F>
__device__ void threadwise_4d_tensor_op(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)

@@ -59,3 +62,71 @@ __device__ void threadwise_4d_tensor_op(

        }
    }
}
#endif

#if THREADWISE_TENSOR_OP_METHOD == 1
template <class TFloat, class SrcDesc, class DstDesc, class F>
__device__ void threadwise_4d_tensor_op(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(src_desc);
        print_ConstantTensorDescriptor(dst_desc);
    }
#endif

    unsigned sindex = 0;
    unsigned dindex = 0;

    for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)
    {
        for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1)
        {
            for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2)
            {
                for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3)
                {
                    f(p_src[sindex], p_dst[dindex]);

#if 0
                    if(threadIdx.x == 0)
                    {
                        printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
                               "sindex %u, p_src[sindex] %f, \t"
                               "dindex %u, p_dst[dindex] %f\n",
                               threadIdx.x,
                               sindex,
                               p_src[sindex],
                               dindex,
                               p_dst[dindex]);
                    }
#endif

                    sindex += src_desc.GetStride(I3);
                    dindex += dst_desc.GetStride(I3);
                }

                sindex += src_desc.GetStride(I2) - src_desc.GetLength(I3) * src_desc.GetStride(I3);
                dindex += dst_desc.GetStride(I2) - dst_desc.GetLength(I3) * dst_desc.GetStride(I3);
            }

            sindex += src_desc.GetStride(I1) - src_desc.GetLength(I2) * src_desc.GetStride(I2);
            dindex += dst_desc.GetStride(I1) - dst_desc.GetLength(I2) * dst_desc.GetStride(I2);
        }

        sindex += src_desc.GetStride(I0) - src_desc.GetLength(I1) * src_desc.GetStride(I1);
        dindex += dst_desc.GetStride(I0) - dst_desc.GetLength(I1) * dst_desc.GetStride(I1);
    }
}
#endif
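The new threadwise method walks sindex and dindex incrementally: the innermost loop adds GetStride(I3), and when a loop closes the offset is advanced by the next stride minus what the inner loop accumulated (GetLength times GetStride of the inner dimension). A host-side sketch checking that this rewind arithmetic reproduces the direct offset for an illustrative packed 4D layout (sizes are not from the commit):

#include <cassert>
#include <cstdio>

int main()
{
    const unsigned len[4]    = {2, 3, 4, 5};
    const unsigned stride[4] = {60, 20, 5, 1};

    unsigned index = 0;
    for(unsigned d0 = 0; d0 < len[0]; ++d0)
    {
        for(unsigned d1 = 0; d1 < len[1]; ++d1)
        {
            for(unsigned d2 = 0; d2 < len[2]; ++d2)
            {
                for(unsigned d3 = 0; d3 < len[3]; ++d3)
                {
                    const unsigned ref =
                        d0 * stride[0] + d1 * stride[1] + d2 * stride[2] + d3 * stride[3];
                    assert(index == ref);   // running offset equals the direct formula
                    index += stride[3];
                }
                index += stride[2] - len[3] * stride[3];   // rewind dim 3, step dim 2
            }
            index += stride[1] - len[2] * stride[2];       // rewind dim 2, step dim 1
        }
        index += stride[0] - len[1] * stride[1];           // rewind dim 1, step dim 0
    }
    std::printf("rewind walk matches the direct offset for all %u elements\n",
                len[0] * len[1] * len[2] * len[3]);
    return 0;
}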