yangql / composable_kernel-1

Commit 8c923db4, authored Mar 22, 2019 by Chao Liu

hip build

parent e72eece8
Showing 19 changed files with 196 additions and 155 deletions (+196 -155)
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp (+15 -15)
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp (+35 -2)
driver/driver.hip.cpp (+22 -10)
src/include/Array.hip.hpp (+1 -1)
src/include/blockwise_4d_tensor_op.hip.hpp (+5 -4)
src/include/blockwise_direct_convolution.hip.hpp (+15 -12)
src/include/blockwise_gemm.hip.hpp (+16 -13)
src/include/common.hip.hpp (+2 -0)
src/include/config.h.in (+2 -0)
src/include/data_type.hip.hpp (+3 -30)
src/include/gridwise_direct_convolution_1.hip.hpp (+10 -9)
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp (+24 -19)
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp (+19 -15)
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp (+15 -13)
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp (+5 -4)
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp (+2 -2)
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp (+2 -2)
src/include/tensor.hpp (+1 -2)
src/include/threadwise_nd_tensor_op.hip.hpp (+2 -2)
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp

@@ -51,7 +51,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
            in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
            in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
                                                     in_nchw(n, 4 * c + 1, h, w),
                                                     in_nchw(n, 4 * c + 2, h, w),
                                                     in_nchw(n, 4 * c + 3, h, w));

@@ -113,37 +113,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
    constexpr unsigned BlockSize = 128;
#elif 0
    // 3x3, 34x34, 128 thread, fp32, vector = 2
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;

    constexpr unsigned BlockSize = 128;
#elif 0
    // 3x3, 34x34, 128 thread, int8, vector = 4
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 8;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned NPerThread  = 1;
    constexpr unsigned KPerThread  = 8;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 4;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;

    constexpr unsigned BlockSize = 128;
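Note: the #elif 1 branch above packs four adjacent input channels into one vector element. For orientation, a minimal sketch of a Pack helper of that shape, assuming an int8 path where four char lanes land in one 32-bit word; the name char4_pack is illustrative, and the real vector_t comes from vector_type<...>::MemoryType in src/include/data_type.hip.hpp:

#include <cstdint>

// Hedged sketch only, not the commit's implementation.
struct char4_pack
{
    static int32_t Pack(char x0, char x1, char x2, char x3)
    {
        // Place lane 0 in the low byte, lane 3 in the high byte.
        uint32_t w = static_cast<uint8_t>(x0) |
                     (static_cast<uint32_t>(static_cast<uint8_t>(x1)) << 8) |
                     (static_cast<uint32_t>(static_cast<uint8_t>(x2)) << 16) |
                     (static_cast<uint32_t>(static_cast<uint8_t>(x3)) << 24);
        return static_cast<int32_t>(w);
    }
};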
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp

@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_khwn_device_buf.ToDevice(out_khwn.mData.data());

-#if 1
+#if 0
    // for 3x3, 34x34
    constexpr unsigned NPerBlock = 16;
    constexpr unsigned KPerBlock = 64;

@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
    // for 1x1, 28x28
    constexpr unsigned NPerBlock = 16;
    constexpr unsigned KPerBlock = 128;

@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    constexpr unsigned OutThreadCopyDataPerWrite = 2;

    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 14x14
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
+    constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
+    constexpr unsigned InBlockCopyDataPerRead    = 4;
+
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
+    constexpr unsigned BlockSize = 128;
 #endif
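Note: in the new #elif 1 (1x1, 14x14) block, the input-copy thread decomposition appears sized to fill the workgroup exactly: C x H x W x N = 8 x 2 x 2 x 4 = 128 threads, matching BlockSize = 128. A compile-time check of that invariant, as a standalone sketch (not part of the commit; constants copied from the block above):

// Hedged sanity sketch: the in-block copy thread grid should use every
// thread of the workgroup exactly once.
constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
constexpr unsigned BlockSize                 = 128;

static_assert(InBlockCopy_ThreadPerDimC * InBlockCopy_ThreadPerDimH *
                      InBlockCopy_ThreadPerDimW * InBlockCopy_ThreadPerDimN ==
                  BlockSize,
              "in-block copy thread grid must match BlockSize");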
driver/driver.hip.cpp

@@ -8,11 +8,11 @@
#include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.hip.hpp"
//#include "device_direct_convolution_1.hpp"
-#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
-#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
-//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
+//#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
+//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
-//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
+#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"

struct GeneratorTensor_1
{

@@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                std::size_t ho = HoPerTile * htile + j;
                for(int i = 0; i < WoPerTile; ++i)
                {
                    std::size_t wo = WoPerTile * wtile + i;
                    out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
                }
            }

@@ -568,7 +568,7 @@ int main(int argc, char* argv[])
    constexpr unsigned HPad = 2;
    constexpr unsigned WPad = 2;
-#elif 1
+#elif 0
    // 1x1 filter, 32x32 image
    constexpr unsigned N = 64;
    constexpr unsigned C = 256;

@@ -578,6 +578,18 @@ int main(int argc, char* argv[])
    constexpr unsigned Y = 1;
    constexpr unsigned X = 1;
    constexpr unsigned HPad = 0;
    constexpr unsigned WPad = 0;
+#elif 1
+    // 1x1 filter, 14x14 image
+    constexpr unsigned N    = 128;
+    constexpr unsigned C    = 2048;
+    constexpr unsigned HI   = 14;
+    constexpr unsigned WI   = 14;
+    constexpr unsigned K    = 512;
+    constexpr unsigned Y    = 1;
+    constexpr unsigned X    = 1;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
#endif

@@ -594,8 +606,8 @@ int main(int argc, char* argv[])
    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");

-   using in_data_t  = char;
-   using out_data_t = int32_t;
+   using in_data_t  = float;
+   using out_data_t = float;

    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));

@@ -635,9 +647,9 @@ int main(int argc, char* argv[])
    device_direct_convolution_1
#elif 0
    device_direct_convolution_2_nchw_kcyx_nkhw
-#elif 1
-    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
-#elif 0
+#elif 0
+    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
+#elif 1
    device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 0
    device_implicit_gemm_convolution_2_chwn_cyxk_khwn
src/include/Array.hip.hpp

@@ -10,7 +10,7 @@ struct Array
    unsigned mData[nSize];

    template <class... Xs>
-   __host__ __device__ Array(Xs... xs) : mData({static_cast<TData>(xs)...})
+   __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
    {
    }
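Note: the one-line change swaps mData({...}) for mData{...}. Parenthesized initialization of a raw-array member is not portable (some compilers accept the parenthesized braced-init-list as an extension; hip-clang rejects it), while direct brace initialization is standard. A standalone repro of the pattern, with illustrative names not taken from the commit:

template <class TData, unsigned nSize>
struct ArraySketch
{
    TData mData[nSize];

    template <class... Xs>
    ArraySketch(Xs... xs) : mData{static_cast<TData>(xs)...} // OK: brace init of array member
    // ArraySketch(Xs... xs) : mData({static_cast<TData>(xs)...}) // rejected by hip-clang
    {
    }
};

ArraySketch<unsigned, 3> three(1u, 2u, 3u); // expands to mData{1u, 2u, 3u}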
src/include/blockwise_4d_tensor_op.hip.hpp

@@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded
        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

        const Float* p_src_tmp =
            p_src +
            src_desc.Get1dIndex(c_block_data_begin,
                                (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
                                (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
                                n_block_data_begin);

#if 0
        if(get_thread_local_1d_id() == 0)
src/include/blockwise_direct_convolution.hip.hpp

@@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
    Float p_out_thread[out_thread_desc.GetElementSpace()];

    threadwise_4d_tensor_copy(out_block_desc,
                              p_out_block +
                                  out_block_desc.Get1dIndex(n_thread_data_begin,
                                                            k_thread_data_begin,
                                                            ho_thread_data_begin,
                                                            wo_thread_data_begin),
                              out_thread_desc,
                              p_out_thread,
                              out_thread_desc.GetLengths());

@@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
    // threadwise convolution
    threadwise_direct_convolution_2(in_thread_block_desc,
                                    p_in_block +
                                        in_block_desc.Get1dIndex(n_thread_data_begin,
                                                                 c_thread_data_begin,
                                                                 hi_thread_data_begin,
                                                                 wi_thread_data_begin),
                                    wei_thread_block_desc,
                                    p_wei_block +
                                        wei_block_desc.Get1dIndex(
                                            k_thread_data_begin, c_thread_data_begin, 0, 0),

@@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
    threadwise_4d_tensor_copy(out_thread_desc,
                              p_out_thread,
                              out_block_desc,
                              p_out_block +
                                  out_block_desc.Get1dIndex(n_thread_data_begin,
                                                            k_thread_data_begin,
                                                            ho_thread_data_begin,
                                                            wo_thread_data_begin),
                              out_thread_desc.GetLengths());
}
src/include/blockwise_gemm.hip.hpp

@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;

        // loop over k
#pragma unroll
        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
        {
            // read first batch of A, B
            // copy A-sub to form A
#pragma unroll
            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {

@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                                       a_thread_sub_mtx.GetLengths());
            }

            // copy B-sub to form B
#pragma unroll
            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {

@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                                       b_thread_sub_mtx.GetLengths());
            }

            // loop over batch
#pragma unroll
            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
            {

@@ -551,14 +551,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
            c_thread_mtx_begin.batch * BlockMatrixStrideC +
            c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);

-       for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
+       for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
        {
-           for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
+           for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
                threadwise_matrix_copy(
                    c_thread_sub_mtx,
                    p_c_thread +
                        c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
                                                    n_repeat * NPerLevel1Cluster),
                    c_block_mtx,
                    p_c_block +
                        c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
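Note: the loop-condition fix above is the one behavioral change in this hunk. "m_repeat, MRepeat" is the comma operator: it discards m_repeat and tests the constant MRepeat, which is always nonzero here, so the condition never becomes false. A self-contained illustration (values hypothetical, not from the commit):

void comma_condition_sketch()
{
    constexpr unsigned MRepeat = 2;

    // Buggy form: the comma operator makes the condition test MRepeat,
    // always true, so the loop never terminates on its own.
    // for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat) { ... }

    // Fixed form: runs exactly MRepeat times.
    for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
    {
        // ... copy one MPerLevel1Cluster x NPerLevel1Cluster tile ...
    }
}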
@@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC
        constexpr unsigned NClusterWork =
            (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);

        static_assert(BlockSize ==
                          (MClusterWork * MThreadPerCluster) * (NClusterWork * NThreadPerCluster),
                      "wrong! wrong BlockSize");

        if(DistributeThreadAlongColumnFirst)

@@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                        c_thread_sub_mtx,
                        False,
                        p_c_thread +
                            c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
                                                    n_repeat * NPerThreadSubC),
                        f_accum);
                }
            }
src/include/common.hip.hpp

@@ -21,6 +21,7 @@ struct is_same<T, T>
    static const bool value = true;
};

+#if 0
template <typename T>
__host__ __device__ constexpr T max(T a, T b)
{

@@ -32,6 +33,7 @@ __host__ __device__ constexpr T min(T a, T b)
{
    return a < b ? a : b;
}
+#endif

__host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
{
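Note: the trailing context cuts integer_divide_ceil off at its opening brace; its body is presumably the usual round-up division (a hedged reconstruction for readability, not shown in this diff):

__host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
{
    return (a + b - 1) / b; // ceil(a / b) for b > 0
}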
src/include/config.h.in

@@ -4,8 +4,10 @@
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#endif
src/include/data_type.hip.hpp

#pragma once
#include "config.h"

#if DEVICE_BACKEND_CUDA
namespace CUDA {
#include "cuda_fp16.h"
}
#endif

using half  = CUDA::half;
using half2 = CUDA::half2;

template <class T, unsigned N>
struct vector_type
{

@@ -52,6 +43,7 @@ struct vector_type<float2, 2>
    using MemoryType = float4;
};

+#if 0
template <>
struct vector_type<half, 1>
{

@@ -91,24 +83,6 @@ struct vector_type<half, 8>
    using MemoryType = float4;
};

-template <>
-struct vector_type<half2, 1>
-{
-    using MemoryType = half2;
-};
-
-template <>
-struct vector_type<half2, 2>
-{
-    using MemoryType = float2;
-};
-
-template <>
-struct vector_type<half2, 4>
-{
-    using MemoryType = float4;
-};

template <>
struct vector_type<char, 1>
{

@@ -169,7 +143,6 @@ struct vector_type<int32_t, 2>
    using MemoryType = int64_t;
};

-#if 0
template <>
struct vector_type<char2, 2>
{

@@ -214,6 +187,7 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
    d += s0.w * s1.w;
}

+#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }

__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)

@@ -222,12 +196,10 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
    d += s0.y * s1.y;
}

-#if 0
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x + s0.y * s1.y;
}
-#endif

__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }

@@ -239,3 +211,4 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
    d = __dp4a(s0, s1, d);
#endif
}
+#endif
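Note: after the new #if 0 fences, what stays active in this header is the trait-plus-overload pattern for float and integer types: vector_type<T, N>::MemoryType names the widest type moved in one transaction, and fused_multiply_accumulate dispatches on the element type. A reduced sketch of the float side (the float4 overload mirrors the file; the vector_type<float, 1/4> specializations are assumed from the surrounding pattern):

#include "hip/hip_runtime.h"

template <class T, unsigned N>
struct vector_type;

template <>
struct vector_type<float, 1> { using MemoryType = float; };

template <>
struct vector_type<float, 4> { using MemoryType = float4; }; // one 128-bit load/store

__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
{
    // 4-wide dot-product accumulate over the packed lanes.
    d += s0.x * s1.x;
    d += s0.y * s1.y;
    d += s0.z * s1.z;
    d += s0.w * s1.w;
}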
src/include/gridwise_direct_convolution_1.hip.hpp

@@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
        c_block_work_begin += CPerBlock)
    {
        // copy input tensor to LDS
        blockwise_in_copy.Run(p_in_global +
                                  in_global_desc.Get1dIndex(n_block_work_begin,
                                                            c_block_work_begin,
                                                            hi_block_work_begin,
                                                            wi_block_work_begin),
                              p_in_block);

        // copy weight tensor to LDS

@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
    }

    // copy output tensor from LDS to device mem
    blockwise_out_copy.Run(p_out_block,
                           p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
                                                                     k_block_work_begin,
                                                                     ho_block_work_begin,
                                                                     wo_block_work_begin));
}
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp

@@ -176,16 +176,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
        blockwise_in_copy.Run(p_in_global +
                                  in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                                 c_block_data_begin,
                                                                 hi_block_data_begin,
                                                                 wi_block_data_begin),
                              p_in_block);

        // copy weight tensor to LDS
        blockwise_wei_copy.Run(
            p_wei_global +
                wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            p_wei_block);

        __syncthreads();

@@ -195,10 +197,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
#if 1
            threadwise_direct_convolution_2(
                in_nchw_thread_block_desc,
                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
                                                           c_thread_data,
                                                           hi_thread_data_begin,
                                                           wi_thread_data_begin),
                wei_kcyx_thread_block_desc,
                p_wei_block +
                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),

@@ -207,10 +210,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
#elif 0
            threadwise_direct_convolution_3(
                in_nchw_thread_block_desc,
                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
                                                           c_thread_data,
                                                           hi_thread_data_begin,
                                                           wi_thread_data_begin),
                wei_kcyx_thread_block_desc,
                p_wei_block +
                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),

@@ -225,9 +229,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
            out_nkhw_thread_desc,
            p_out_thread,
            out_nkhw_global_desc,
            p_out_global +
                out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                                k_block_data_begin + k_thread_data_begin,
                                                ho_block_data_begin + ho_thread_data_begin,
                                                wo_block_data_begin + wo_thread_data_begin),
            out_nkhw_thread_desc.GetLengths());
}
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp

@@ -200,9 +200,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
                              p_in_vec_block);

        // copy weight tensor to LDS
        blockwise_wei_copy.Run(
            p_wei_vec_global +
                wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            p_wei_vec_block);

        __syncthreads();

@@ -212,10 +213,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
#if 1
            threadwise_direct_convolution_2(
                in_nchw_vec_thread_block_desc,
                p_in_vec_block +
                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),

@@ -224,10 +226,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
#elif 0
            threadwise_direct_convolution_3(
                in_nchw_vec_thread_block_desc,
                p_in_vec_block +
                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),

@@ -242,9 +245,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
            out_nkhw_thread_desc,
            p_out_thread,
            out_nkhw_global_desc,
            p_out_global +
                out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                                k_block_data_begin + k_thread_data_begin,
                                                ho_block_data_begin + ho_thread_data_begin,
                                                wo_block_data_begin + wo_thread_data_begin),
            out_nkhw_thread_desc.GetLengths());
}
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp

@@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
    threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);

    const Float* p_in_global_block_begin =
        p_in_global +
        in_chwn_global_desc.Get1dIndex(
            0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);

    const Float* p_wei_global_block_begin =
        p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);

@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
        }
    }

    // output: register to global mem,
#if 0
    const auto c_thread_mtx_begin =
        blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

@@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
        }
#endif

        threadwise_8d_tensor_copy(
            out_8d_thread_desc,
            p_out_thread,
            out_8d_global_desc,
            p_out_global +
                out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
                                                ho_block_data_begin + ho_thread_data_begin,
                                                wo_block_data_begin + wo_thread_data_begin,
                                                n_block_data_begin + n_thread_data_begin),
            out_8d_thread_desc.GetLengths(),
            Number<OutThreadCopyDataPerWrite>{});
    }
    else if(NPerThread == NPerBlock)
    {
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp

@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
        out_hkwn_thread_desc,
        p_out_thread,
        out_khwn_global_desc,
        p_out_global +
            out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
                                            ho_block_data_begin + ho_thread_data_begin,
                                            wo_block_data_begin + wo_thread_data_begin,
                                            n_block_data_begin + n_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_khwn_from_hkwn);
}
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp

@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
                                          decltype(in_cb_block_desc),
                                          decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),

@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
                               InBlockCopyThreadPerDim0,
                               InBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp

@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
                                          decltype(in_cb_block_desc),
                                          decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),

@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
                               InBlockCopyThreadPerDim0,
                               InBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
src/include/tensor.hpp

@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
    return os;
}

-typedef enum
-{
+typedef enum {
    Half  = 0,
    Float = 1,
} DataType_t;
src/include/threadwise_nd_tensor_op.hip.hpp

@@ -10,7 +10,7 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
                                          SrcOpLengths,
                                          Number<DataPerRead>)
{
-   using vector_t = typename vector_type<Float, DataPerRead>::type;
+   using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 &&
                      SrcOpLengths::nDim == 6,

@@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc,
                                          SrcOpLengths,
                                          Number<DataPerRead>)
{
-   using vector_t = typename vector_type<Float, DataPerRead>::type;
+   using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 &&
                      SrcOpLengths::nDim == 8,
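Note: the ::type to ::MemoryType rename matters because these copies reinterpret DataPerRead consecutive Float elements as one wide value. A condensed sketch of the access pattern the tensor-copy helpers rely on, assuming the vector_type trait from data_type.hip.hpp is in scope (the function name is illustrative):

// Hedged sketch: move DataPerRead elements as a single vector transaction,
// the way threadwise_6d/8d_tensor_copy use vector_type<...>::MemoryType.
template <class Float, unsigned DataPerRead>
__device__ void copy_vectorized(Float* dst, const Float* src)
{
    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
    *reinterpret_cast<vector_t*>(dst) = *reinterpret_cast<const vector_t*>(src);
}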