Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
3dbd4725
"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "a65ef9030880d51dd159e4d23f1dc6093b17651c"
Commit
3dbd4725
authored
Jan 04, 2019
by
Chao Liu
Browse files
added threadwise tensor reorder operation
parent
21c91816
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
55 additions
and
27 deletions
+55
-27
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+2
-0
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+1
-1
src/include/threadwise_tensor_op.cuh
src/include/threadwise_tensor_op.cuh
+52
-26
No files found.
src/include/blockwise_tensor_op.cuh
View file @
3dbd4725
...
@@ -76,6 +76,8 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
...
@@ -76,6 +76,8 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
}
}
}
}
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
3dbd4725
...
@@ -182,7 +182,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -182,7 +182,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
{
// threadwise convolution
// threadwise convolution
#if
1
#if
0
threadwise_direct_convolution_2(
threadwise_direct_convolution_2(
in_thread_block_desc,
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
...
...
src/include/threadwise_tensor_op.cuh
View file @
3dbd4725
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
class
Desc
,
class
F
>
template
<
class
TFloat
,
class
Desc
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_op_unary
(
Desc
,
TFloat
*
__restrict__
p
,
F
f
)
__device__
void
threadwise_4d_tensor_pointwise_op
eration
_unary
(
Desc
,
TFloat
*
__restrict__
p
,
F
f
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -35,41 +35,48 @@ __device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict
...
@@ -35,41 +35,48 @@ __device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict
}
}
}
}
template
<
class
TFloat
,
class
DescA
,
class
DescB
,
class
DescRef
,
class
F
>
// TODO: in order to optimize mem access for different mem type,
__device__
void
threadwise_4d_tensor_pointwise_op_binary
(
// need to write specialized version
DescA
,
TFloat
*
const
__restrict__
p_a
,
DescB
,
TFloat
*
__restrict__
p_b
,
DescRef
,
F
f
)
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
,
Reorder
,
F
f
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc_a
=
DescA
{};
constexpr
unsigned
IT0
=
Reorder
{}.
Get
(
I0
);
constexpr
auto
desc_b
=
DescB
{};
constexpr
unsigned
IT1
=
Reorder
{}.
Get
(
I1
);
constexpr
auto
desc_ref
=
DescRef
{};
constexpr
unsigned
IT2
=
Reorder
{}.
Get
(
I2
);
constexpr
unsigned
IT3
=
Reorder
{}.
Get
(
I3
);
#if 0
constexpr
auto
src_desc
=
SrcDesc
{};
if(threadIdx.x == 0)
constexpr
auto
dst_desc
=
DstDesc
{};
{
constexpr
auto
ref_desc
=
RefDesc
{};
print_ConstantTensorDescriptor(desc_a, "threadwise_4d_tensor_op_binary: desc_a: ");
print_ConstantTensorDescriptor(desc_b, "threadwise_4d_tensor_op_binary: desc_b: ");
print_ConstantTensorDescriptor(desc_ref, "threadwise_4d_tensor_op_binary: desc_ref: ");
}
#endif
for
(
unsigned
did0
=
0
;
did0
<
desc
_ref
.
GetLength
(
I0
);
++
did0
)
for
(
unsigned
did0
=
0
;
did0
<
ref_
desc
.
GetLength
(
I0
);
++
did0
)
{
{
for
(
unsigned
did1
=
0
;
did1
<
desc
_ref
.
GetLength
(
I1
);
++
did1
)
for
(
unsigned
did1
=
0
;
did1
<
ref_
desc
.
GetLength
(
I1
);
++
did1
)
{
{
for
(
unsigned
did2
=
0
;
did2
<
desc
_ref
.
GetLength
(
I2
);
++
did2
)
for
(
unsigned
did2
=
0
;
did2
<
ref_
desc
.
GetLength
(
I2
);
++
did2
)
{
{
for
(
unsigned
did3
=
0
;
did3
<
desc
_ref
.
GetLength
(
I3
);
++
did3
)
for
(
unsigned
did3
=
0
;
did3
<
ref_
desc
.
GetLength
(
I3
);
++
did3
)
{
{
const
unsigned
aindex
=
desc_a
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
aindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
did
[
4
]
=
{
did0
,
did1
,
did2
,
did3
};
const
unsigned
bindex
=
desc_b
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
bindex
=
dst_desc
.
Get1dIndex
(
did
[
IT0
],
did
[
IT1
],
did
[
IT2
],
did
[
IT3
]);
f
(
p_
a
[
aindex
],
p_
b
[
bindex
]);
f
(
p_
src
[
aindex
],
p_
dst
[
bindex
]);
}
}
}
}
}
}
...
@@ -81,18 +88,37 @@ __device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
...
@@ -81,18 +88,37 @@ __device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
{
{
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
threadwise_4d_tensor_pointwise_op_unary
<
TFloat
,
Desc
,
decltype
(
f_set_zero
)
>
(
threadwise_4d_tensor_pointwise_op
eration
_unary
<
TFloat
,
Desc
,
decltype
(
f_set_zero
)
>
(
Desc
{},
p
,
f_set_zero
);
Desc
{},
p
,
f_set_zero
);
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
>
__device__
void
threadwise_4d_tensor_copy_reorder
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
,
Reorder
)
{
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
threadwise_4d_tensor_pointwise_operation_binary_reorder
<
TFloat
,
SrcDesc
,
DstDesc
,
RefDesc
,
Reorder
,
decltype
(
f_copy
)
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
__device__
void
threadwise_4d_tensor_copy
(
__device__
void
threadwise_4d_tensor_copy
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
)
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
)
{
{
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{
};
threadwise_4d_tensor_
pointwise_op_binary
<
TFloat
,
SrcDesc
,
DstDesc
,
RefDesc
,
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_
copy_reorder
<
TFloat
,
SrcDesc
,
DstDesc
,
RefDesc
,
decltype
(
reorder
)
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
f_copy
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
reorder
);
}
}
template
<
class
TFloat
,
class
Desc
,
class
IDim
,
class
NShift
>
template
<
class
TFloat
,
class
Desc
,
class
IDim
,
class
NShift
>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment