Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
04c5527d
Commit
04c5527d
authored
Mar 04, 2019
by
Chao Liu
Browse files
refactor
parent
5fd40ad7
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
94 additions
and
100 deletions
+94
-100
driver/driver.hip.cpp
driver/driver.hip.cpp
+2
-0
src/include/blockwise_2d_tensor_op.hip.hpp
src/include/blockwise_2d_tensor_op.hip.hpp
+10
-74
src/include/blockwise_4d_tensor_op.hip.hpp
src/include/blockwise_4d_tensor_op.hip.hpp
+5
-26
src/include/common.hip.hpp
src/include/common.hip.hpp
+75
-0
src/include/config.h.in
src/include/config.h.in
+2
-0
No files found.
driver/driver.hip.cpp
View file @
04c5527d
...
@@ -633,6 +633,7 @@ int main(int argc, char* argv[])
...
@@ -633,6 +633,7 @@ int main(int argc, char* argv[])
if
(
do_verification
)
if
(
do_verification
)
{
{
#if 1
if
(
Y
==
3
&&
X
==
3
)
if
(
Y
==
3
&&
X
==
3
)
{
{
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
...
@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
...
@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
host_direct_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
host_direct_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
}
}
check_error
(
out_nkhw_host
,
out_nkhw_device
);
check_error
(
out_nkhw_host
,
out_nkhw_device
);
#endif
#if 0
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
...
...
src/include/blockwise_2d_tensor_op.hip.hpp
View file @
04c5527d
...
@@ -373,6 +373,8 @@ template <unsigned BlockSize,
...
@@ -373,6 +373,8 @@ template <unsigned BlockSize,
unsigned
DataPerRead
>
unsigned
DataPerRead
>
struct
Blockwise2dTensorCopy3
struct
Blockwise2dTensorCopy3
{
{
using
vector_t
=
typename
vector_type
<
Float
,
DataPerRead
>::
type
;
unsigned
mSrcMyThreadOffset
;
unsigned
mSrcMyThreadOffset
;
unsigned
mDstMyThreadOffset
;
unsigned
mDstMyThreadOffset
;
...
@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3
...
@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
using
Float2
=
float2
;
using
Float4
=
float4
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
...
@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
auto
f_copy
=
[
&
](
unsigned
iloop
)
{
auto
f_copy
=
[
&
](
unsigned
iloop
)
{
if
(
DataPerRead
==
1
)
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
{
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
mSrcMyThreadOffset
+
p_dst
[
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
]
=
iloop
*
src_loop_stride
));
p_src
[
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
];
}
else
if
(
DataPerRead
==
2
)
{
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
));
}
else
if
(
DataPerRead
==
4
)
{
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
));
}
else
{
assert
(
false
);
}
};
};
for
(
unsigned
iloop
=
0
;
iloop
<
nloop_d0
;
++
iloop
)
for
(
unsigned
iloop
=
0
;
iloop
<
nloop_d0
;
++
iloop
)
...
@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
...
@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
__device__
void
RunLoadRegisterClipboard
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegisterClipboard
(
const
Float
*
__restrict__
p_src
,
Float
*
p_clipboard
)
const
Float
*
p_clipboard
)
const
{
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
using
Float2
=
float2
;
using
Float4
=
float4
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
...
@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
auto
f_copy
=
[
&
](
unsigned
iloop
)
{
auto
f_copy
=
[
&
](
unsigned
iloop
)
{
if
(
DataPerRead
==
1
)
*
(
reinterpret_cast
<
vector_t
*>
(
p_clipboard
+
iloop
*
4
))
=
{
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
mSrcMyThreadOffset
+
p_clipboard
[
iloop
]
=
p_src
[
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
];
iloop
*
src_loop_stride
));
}
else
if
(
DataPerRead
==
2
)
{
*
(
reinterpret_cast
<
Float2
*>
(
p_clipboard
+
iloop
*
2
))
=
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
));
}
else
if
(
DataPerRead
==
4
)
{
*
(
reinterpret_cast
<
Float4
*>
(
p_clipboard
+
iloop
*
4
))
=
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
));
}
else
{
assert
(
false
);
}
};
};
for
(
unsigned
iloop
=
0
;
iloop
<
nloop_d0
;
++
iloop
)
for
(
unsigned
iloop
=
0
;
iloop
<
nloop_d0
;
++
iloop
)
...
@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
...
@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
__device__
void
RunStoreRegisterClipboard
(
const
Float
*
__restrict__
p_clipboard
,
__device__
void
RunStoreRegisterClipboard
(
const
Float
*
__restrict__
p_clipboard
,
Float
*
__restrict__
p_dst
)
const
Float
*
__restrict__
p_dst
)
const
{
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
using
Float2
=
float2
;
using
Float4
=
float4
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
...
@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
auto
f_copy
=
[
&
](
unsigned
iloop
)
{
auto
f_copy
=
[
&
](
unsigned
iloop
)
{
if
(
DataPerRead
==
1
)
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
{
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_clipboard
+
iloop
*
4
));
p_dst
[
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
]
=
p_clipboard
[
iloop
];
}
else
if
(
DataPerRead
==
2
)
{
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
*
(
reinterpret_cast
<
const
Float2
*>
(
p_clipboard
+
iloop
*
2
));
}
else
if
(
DataPerRead
==
4
)
{
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
*
(
reinterpret_cast
<
const
Float4
*>
(
p_clipboard
+
iloop
*
4
));
}
else
{
assert
(
false
);
}
};
};
for
(
unsigned
iloop
=
0
;
iloop
<
nloop_d0
;
++
iloop
)
for
(
unsigned
iloop
=
0
;
iloop
<
nloop_d0
;
++
iloop
)
...
...
src/include/blockwise_4d_tensor_op.hip.hpp
View file @
04c5527d
...
@@ -349,6 +349,8 @@ template <unsigned BlockSize,
...
@@ -349,6 +349,8 @@ template <unsigned BlockSize,
unsigned
DataPerRead
>
unsigned
DataPerRead
>
struct
Blockwise4dTensorCopy3
struct
Blockwise4dTensorCopy3
{
{
using
vector_t
=
typename
vector_type
<
Float
,
DataPerRead
>::
type
;
unsigned
mSrcMyThreadOffset
;
unsigned
mSrcMyThreadOffset
;
unsigned
mDstMyThreadOffset
;
unsigned
mDstMyThreadOffset
;
...
@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3
...
@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
using
Float2
=
float2
;
using
Float4
=
float4
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
...
@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
...
@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
iloop_d2
*
thread_per_d2
,
iloop_d2
*
thread_per_d2
,
iloop_d3
*
thread_per_d3
*
DataPerRead
);
iloop_d3
*
thread_per_d3
*
DataPerRead
);
if
(
DataPerRead
==
1
)
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_offset
+
mDstMyThreadOffset
))
=
{
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_offset
+
p_dst
[
dst_offset
+
mDstMyThreadOffset
]
=
mSrcMyThreadOffset
));
p_src
[
src_offset
+
mSrcMyThreadOffset
];
}
else
if
(
DataPerRead
==
2
)
{
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
dst_offset
+
mDstMyThreadOffset
))
=
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
src_offset
+
mSrcMyThreadOffset
));
}
else
if
(
DataPerRead
==
4
)
{
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
dst_offset
+
mDstMyThreadOffset
))
=
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
src_offset
+
mSrcMyThreadOffset
));
}
else
{
assert
(
false
);
}
}
}
}
}
}
}
...
...
src/include/common.hip.hpp
View file @
04c5527d
...
@@ -16,6 +16,81 @@ struct is_same<T, T>
...
@@ -16,6 +16,81 @@ struct is_same<T, T>
static
const
bool
value
=
true
;
static
const
bool
value
=
true
;
};
};
// Primary template of the vector_type trait, keyed on an element type T and
// an element count N. It declares no `type` member, so
// `vector_type<T, N>::type` only names a type for (T, N) combinations that
// have an explicit specialization.
template <typename T, unsigned N>
struct vector_type
{
};
// vector_type<float, 1>: one float is represented by a plain `float`.
template <>
struct vector_type<float, 1>
{
    using type = float;
};
// vector_type<float, 2>: two floats are represented by `float2`.
// NOTE(review): `float2` is presumably the device runtime's built-in vector
// type (pulled in through the backend headers) -- confirm.
template <>
struct vector_type<float, 2>
{
    using type = float2;
};
// vector_type<float, 4>: four floats are represented by `float4`.
// NOTE(review): `float4` is presumably the device runtime's built-in vector
// type (pulled in through the backend headers) -- confirm.
template <>
struct vector_type<float, 4>
{
    using type = float4;
};
// Disabled (`#if 0`) alternative set of vector_type specializations keyed on
// `half_float::half` instead of `half`. Nothing in this region is compiled.
// NOTE(review): `half_float::half` presumably comes from the "half.hpp"
// header -- confirm before re-enabling.
#if 0
// One half element: the element type itself.
template <>
struct vector_type<half_float::half, 1>
{
    using type = half_float::half;
};
// Two half elements are mapped to a single `float`.
// NOTE(review): presumably used as same-sized storage (assumes a 16-bit
// half), not as a numeric float -- confirm.
template <>
struct vector_type<half_float::half, 2>
{
    using type = float;
};
// Four half elements are mapped to `float2` (same storage assumption).
template <>
struct vector_type<half_float::half, 4>
{
    using type = float2;
};
// Eight half elements are mapped to `float4` (same storage assumption).
template <>
struct vector_type<half_float::half, 8>
{
    using type = float4;
};
#endif
#if 1
// vector_type<half, 1>: one half element is represented by `half` itself.
// NOTE(review): `half` is presumably supplied by the backend's half-precision
// header -- confirm which header provides it for each backend.
template <>
struct vector_type<half, 1>
{
    using type = half;
};
// vector_type<half, 2>: two half elements are represented by `half2`.
// NOTE(review): `half2` is presumably supplied by the backend's
// half-precision header -- confirm.
template <>
struct vector_type<half, 2>
{
    using type = half2;
};
// vector_type<half, 4>: four half elements are mapped to `float2`.
// NOTE(review): presumably `float2` is used purely as same-sized storage for
// moving four halves at once (assumes a 16-bit half), not as two numeric
// floats -- confirm.
template <>
struct vector_type<half, 4>
{
    using type = float2;
};
// vector_type<half, 8>: eight half elements are mapped to `float4`.
// NOTE(review): presumably `float4` is used purely as same-sized storage for
// moving eight halves at once (assumes a 16-bit half), not as four numeric
// floats -- confirm.
template <>
struct vector_type<half, 8>
{
    using type = float4;
};
#endif
template
<
class
T
,
T
N
>
template
<
class
T
,
T
N
>
struct
integral_constant
struct
integral_constant
{
{
...
...
src/include/config.h.in
View file @
04c5527d
...
@@ -4,8 +4,10 @@
...
@@ -4,8 +4,10 @@
#if DEVICE_BACKEND_HIP
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_runtime.h"
#include "half.hpp"
#elif DEVICE_BACKEND_CUDA
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include "helper_cuda.h"
#include "cuda_fp16.h"
#endif
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment