gaoqiong / composable_kernel · Commit 569ad66e

authored Apr 23, 2019 by Chao Liu
parent 87d8740b

    added implicit gemm v1r3 lds_double_buffer NCHW * CYXK = KNHW, reworked static functionals

Showing 2 of the commit's 22 changed files, with 32 additions and 245 deletions:

    src/include/threadwise_4d_tensor_op.hip.hpp    +2   -0
    src/include/threadwise_nd_tensor_op.hip.hpp    +30  -245
src/include/threadwise_4d_tensor_op.hip.hpp  (+2 -0)

@@ -139,6 +139,7 @@ __device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc,
                                           SrcDesc{},
                                           p_src,
                                           DstDesc{},
                                           p_dst,
                                           SrcOpLengths{},
                                           MapDst2Src{},
                                           f_copy);
 }

+#if 0 // replaced threadwise_nd_tensor_copy
 template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
 __device__ void threadwise_4d_tensor_copy(
     SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)

@@ -210,6 +211,7 @@ __device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
         }
     }
 }
+#endif

 template <class SrcData,
           class DstData,
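The two `+` lines above compile out the fixed-rank 4d copy helpers, which this commit supersedes with the rank-generic `threadwise_nd_tensor_copy` reworked in the next file. A hedged sketch of how a call site migrates, assuming length lists are `Sequence<...>` types and descriptors come from `make_ConstantTensorDescriptor` as elsewhere in these headers (the shape and the `Number<4>` vector width are illustrative, not taken from this commit):

    // hypothetical call site; desc, p_src, p_dst are assumed to exist
    constexpr auto desc = make_ConstantTensorDescriptor(Sequence<2, 3, 4, 8>{});

    // before: the fixed-rank helper, now compiled out by the #if 0 above
    // threadwise_4d_tensor_copy(desc, p_src, desc, p_dst, Sequence<2, 3, 4, 8>{});

    // after: the rank-generic helper; the trailing Number<DataPerRead> argument
    // selects 1-, 2- or 4-wide vector accesses along the innermost dimension
    threadwise_nd_tensor_copy(desc, p_src, desc, p_dst, Sequence<2, 3, 4, 8>{}, Number<4>{});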
src/include/threadwise_nd_tensor_op.hip.hpp  (+30 -245)

@@ -3,7 +3,7 @@
 // need to assume src and dst is aligned
 template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_6d_tensor_copy(SrcDesc,
+__device__ void threadwise_nd_tensor_copy(SrcDesc,
                                           const Float* __restrict__ p_src,
                                           DstDesc,
                                           Float* __restrict__ p_dst,
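Only the name changes in this hunk, but it signals the design: one copy routine for any rank, with the `DataPerRead` template parameter choosing the vector width. The contract enforced by the asserts in the next hunk is that the innermost dimension has stride 1 and a length divisible by `DataPerRead`, so each innermost step can be a single `vector_type<Float, DataPerRead>::MemoryType` transaction. A self-contained sketch of that pattern in plain HIP C++, with `float4` standing in for the 4-wide memory type (the function name and row abstraction are illustrative, not this header's API):

    #include <hip/hip_runtime.h>

    // Copy one contiguous row using 4-wide vector accesses.
    // row_length must be a multiple of 4, and both pointers must be
    // 16-byte aligned -- the same preconditions the static_asserts below check.
    __device__ void copy_contiguous_row_vec4(const float* __restrict__ p_src,
                                             float* __restrict__ p_dst,
                                             int row_length)
    {
        for(int i = 0; i < row_length / 4; ++i)
        {
            // one 128-bit load/store per iteration instead of four 32-bit ones;
            // legal only because the innermost stride is 1
            reinterpret_cast<float4*>(p_dst)[i] =
                reinterpret_cast<const float4*>(p_src)[i];
        }
    }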
@@ -12,268 +12,53 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
 {
     using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

-    static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 &&
-                      SrcOpLengths::nDim == 6,
-                  "wrong! should be 6 dimension");
+    constexpr index_t nDim = SrcOpLengths::GetSize();

-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
+    static_assert(SrcDesc{}.GetDimension() == nDim && DstDesc{}.GetDimension() == nDim,
+                  "wrong! dimension not consistent");

     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
     constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

-    static_assert(SrcDesc{}.GetStride(I5) == 1 && DstDesc{}.GetStride(I5) == 1,
-                  "wrong! only support stride5 == 1!\n");
-
-    static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
-                  "wrong! only support DataPerRead == 1, 2 or 4!\n");
-
-    static_assert(SrcDesc{}.GetStride(I4) % DataPerRead == 0 &&
-                      DstDesc{}.GetStride(I4) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
-
-    constexpr index_t L5 = SrcOpLengths{}.Get(I5);
-
-    static_assert(L5 % DataPerRead == 0, "wrong! L5 should be evenly divided by DataPerRead");
-
-    constexpr index_t nloop_d5 = L5 / DataPerRead;
-
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
+#if 0
+    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
     {
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-                    for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
-                    {
-                        for(index_t iloop_d5 = 0; iloop_d5 < nloop_d5; ++iloop_d5)
-                        {
-                            const index_t src_index = src_desc.Get1dIndex(
-                                did0, did1, did2, did3, did4, iloop_d5 * DataPerRead);
-
-                            const index_t dst_index = dst_desc.Get1dIndex(
-                                did0, did1, did2, did3, did4, iloop_d5 * DataPerRead);
-
-                            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
-                                *(reinterpret_cast<const vector_t*>(p_src + src_index));
-                        }
-                    }
-                }
-            }
-        }
+        print_ConstantTensorDescriptor(src_desc, "src_desc");
+        print_ConstantTensorDescriptor(dst_desc, "dst_desc");
+        print_ConstantTensorDescriptor(ref_desc, "ref_desc");
     }
-}
-
-// need to assume src and dst is aligned
-template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_8d_tensor_copy(SrcDesc,
-                                          const Float* __restrict__ p_src,
-                                          DstDesc,
-                                          Float* __restrict__ p_dst,
-                                          SrcOpLengths,
-                                          Number<DataPerRead>)
-{
-    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
-
-    static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 &&
-                      SrcOpLengths::nDim == 8,
-                  "wrong! should be 8 dimension");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
+#endif

-    static_assert(SrcDesc{}.GetStride(I7) == 1 && DstDesc{}.GetStride(I7) == 1,
-                  "wrong! only support stride7 == 1!\n");
+    static_assert(DataPerRead == 1 || (SrcDesc{}.GetStride(Number<nDim - 1>{}) == 1 &&
+                                       DstDesc{}.GetStride(Number<nDim - 1>{}) == 1),
+                  "wrong! only support stride[nDim-1] == 1!\n");

     static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                   "wrong! only support DataPerRead == 1, 2 or 4!\n");

-    static_assert(SrcDesc{}.GetStride(I6) % DataPerRead == 0 &&
-                      DstDesc{}.GetStride(I6) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
+    static_assert(SrcDesc{}.GetStride(Number<nDim - 2>{}) % DataPerRead == 0 &&
+                      DstDesc{}.GetStride(Number<nDim - 2>{}) % DataPerRead == 0,
+                  "wrong! src and dst stride[nDim-2] should be multiple of DataPerRead to keep alignment");

-    constexpr index_t L7 = SrcOpLengths{}.Get(I7);
+    constexpr index_t L_Back = SrcOpLengths{}.Back();

-    static_assert(L7 % DataPerRead == 0, "wrong! L7 should be evenly divided by DataPerRead");
+    static_assert(L_Back % DataPerRead == 0,
+                  "wrong! lengths[nDim-1] should be evenly divided by DataPerRead");

-    constexpr index_t nloop_d7 = L7 / DataPerRead;
+    constexpr index_t nRead = L_Back / DataPerRead;

-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-                    for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
-                    {
-                        for(index_t did5 = 0; did5 < ref_desc.GetLength(I5); ++did5)
-                        {
-                            for(index_t did6 = 0; did6 < ref_desc.GetLength(I6); ++did6)
-                            {
-                                for(index_t iloop_d7 = 0; iloop_d7 < nloop_d7; ++iloop_d7)
-                                {
-                                    const index_t src_index = src_desc.Get1dIndex(
-                                        did0, did1, did2, did3, did4, did5, did6,
-                                        iloop_d7 * DataPerRead);
-
-                                    const index_t dst_index = dst_desc.Get1dIndex(
-                                        did0, did1, did2, did3, did4, did5, did6,
-                                        iloop_d7 * DataPerRead);
-
-                                    *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
-                                        *(reinterpret_cast<const vector_t*>(p_src + src_index));
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-// need to assume src and dst is aligned
-template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_10d_tensor_copy(SrcDesc,
-                                           const Float* __restrict__ p_src,
-                                           DstDesc,
-                                           Float* __restrict__ p_dst,
-                                           SrcOpLengths,
-                                           Number<DataPerRead>)
-{
-    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
-
-    static_assert(SrcDesc{}.GetDimension() == 10 && DstDesc{}.GetDimension() == 10 &&
-                      SrcOpLengths::GetSize() == 10,
-                  "wrong! should be 10 dimension");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-    constexpr auto I8 = Number<8>{};
-    constexpr auto I9 = Number<9>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
-
-    static_assert(SrcDesc{}.GetStride(I9) == 1 && DstDesc{}.GetStride(I9) == 1,
-                  "wrong! only support stride7 == 1!\n");
-
-    static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
-                  "wrong! only support DataPerRead == 1, 2 or 4!\n");
-
-    static_assert(SrcDesc{}.GetStride(I8) % DataPerRead == 0 &&
-                      DstDesc{}.GetStride(I8) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
-
-    constexpr index_t L9 = SrcOpLengths{}.Get(I9);
-
-    static_assert(L9 % DataPerRead == 0, "wrong! L9 should be evenly divided by DataPerRead");
-
-    constexpr index_t nloop_d9 = L9 / DataPerRead;
-
-#pragma unroll
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-#pragma unroll
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-#pragma unroll
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-#pragma unroll
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-#pragma unroll
-                    for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
-                    {
-#pragma unroll
-                        for(index_t did5 = 0; did5 < ref_desc.GetLength(I5); ++did5)
-                        {
-#pragma unroll
-                            for(index_t did6 = 0; did6 < ref_desc.GetLength(I6); ++did6)
-                            {
-#pragma unroll
-                                for(index_t did7 = 0; did7 < ref_desc.GetLength(I7); ++did7)
-                                {
-#pragma unroll
-                                    for(index_t did8 = 0; did8 < ref_desc.GetLength(I8); ++did8)
-                                    {
-#pragma unroll
-                                        for(index_t iloop_d9 = 0; iloop_d9 < nloop_d9; ++iloop_d9)
-                                        {
-                                            const index_t src_index =
-                                                src_desc.Get1dIndex(did0, did1, did2, did3, did4,
-                                                                    did5, did6, did7, did8,
-                                                                    iloop_d9 * DataPerRead);
-
-                                            const index_t dst_index =
-                                                dst_desc.Get1dIndex(did0, did1, did2, did3, did4,
-                                                                    did5, did6, did7, did8,
-                                                                    iloop_d9 * DataPerRead);
-
-                                            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
-                                                *(reinterpret_cast<const vector_t*>(p_src + src_index));
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
+    static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
+        static_for<0, nRead, 1>{}([=](auto IRead) {
+            constexpr auto multi_id =
+                decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});
+
+            const index_t src_index = src_desc.Get1dIndex(multi_id);
+
+            const index_t dst_index = dst_desc.Get1dIndex(multi_id);
+
+            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
+                *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
+        });
+    });
 }
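The net effect of this hunk: three hand-written loop nests (6d, 8d, 10d) collapse into one function whose iteration is done by the reworked static functionals named in the commit message. `static_ford` enumerates every coordinate of `SrcOpLengths` with its last dimension popped off, `static_for` enumerates the `nRead` vectorized reads along that last dimension, and `PushBack` appends the read offset, so the `multi_id` handed to `Get1dIndex` is a compile-time constant and the offset arithmetic can fold away entirely. Below is a minimal, self-contained illustration of that compile-time loop-nest technique; it assumes CK's `static_for`/`static_ford` behave like counted template recursion, and the names `static_for_demo`, `ford_impl`, `static_ford_demo`, and `print_point` are invented for the sketch, not taken from the repo:

    #include <cstdio>
    #include <utility>

    // Compile-time counted loop: invokes f(integral_constant<int, I>)
    // for every I in [Begin, End).
    template <int Begin, int End, class F>
    void static_for_demo(F f)
    {
        if constexpr(Begin < End)
        {
            f(std::integral_constant<int, Begin>{});
            static_for_demo<Begin + 1, End>(f);
        }
    }

    // Recursion end: all lengths consumed, hand the accumulated multi-index to f.
    template <int... Ids, class F>
    void ford_impl(std::integer_sequence<int>, std::integer_sequence<int, Ids...> done, F f)
    {
        f(done);
    }

    // Peel one length L off the front and loop over it at compile time,
    // appending each coordinate to the accumulated multi-index.
    template <int L, int... Ls, int... Ids, class F>
    void ford_impl(std::integer_sequence<int, L, Ls...>, std::integer_sequence<int, Ids...>, F f)
    {
        static_for_demo<0, L>([=](auto i) {
            ford_impl(std::integer_sequence<int, Ls...>{},
                      std::integer_sequence<int, Ids..., decltype(i)::value>{},
                      f);
        });
    }

    // N-d compile-time loop: calls f once per point of the Lengths... index space.
    template <int... Lengths, class F>
    void static_ford_demo(F f)
    {
        ford_impl(std::integer_sequence<int, Lengths...>{}, std::integer_sequence<int>{}, f);
    }

    template <int I0, int I1>
    void print_point(std::integer_sequence<int, I0, I1>)
    {
        std::printf("visit (%d, %d)\n", I0, I1);
    }

    int main()
    {
        // visit every (d0, d1) of a 2 x 3 index space; each coordinate arrives
        // as a template argument, so indexing arithmetic on it is a constant
        static_ford_demo<2, 3>([](auto ids) { print_point(ids); });
        return 0;
    }

Compiled as ordinary C++17 this prints the six points of the 2 x 3 space in row-major order; in the kernel the same structure lets the compiler fully unroll the copy and precompute every src_index/dst_index.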