gaoqiong / composable_kernel · Commits

Commit b7d05245, authored May 15, 2019 by Chao Liu

    adding implicit gemm v3

parent 4957d5a3

Changes: 29 in total; this page shows 20 changed files with 572 additions and 231 deletions (+572 −231).
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp                                 +148  -0
driver/driver.hip.cpp                                                                         +6    -3
src/include/ConstantMergedTensorDescriptor.hip.hpp                                            +95   -0
src/include/ConstantTensorDescriptor.hip.hpp                                                  +52   -12
src/include/Sequence.hip.hpp                                                                  +15   -3
src/include/blockwise_4d_tensor_op.hip.hpp                                                    +1    -1
src/include/blockwise_gemm.hip.hpp                                                            +20   -38
src/include/blockwise_merged_tensor_slice_op.hip.hpp                                          +55   -0
src/include/blockwise_tensor_slice_op.hip.hpp                                                 +33   -30
src/include/conv_common.hip.hpp                                                               +4    -4
src/include/functional.hip.hpp                                                                +67   -64
src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp                             +2    -2
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp                    +3    -3
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp                    +3    -3
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp                    +14   -14
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp                    +3    -3
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp  +3    -3
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp  +16   -16
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp  +16   -16
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp                    +16   -16
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp  (new file, mode 100644)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hip.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
                                                        const Tensor<T>& in_nchw,
                                                        WeiDesc,
                                                        const Tensor<T>& wei_kcyx,
                                                        OutDesc,
                                                        Tensor<T>& out_nkhw,
                                                        index_t nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcyx_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
    constexpr index_t Wi = in_nchw_desc.GetLength(I3);

    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
    };

    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
        std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // for 3x3, 28x28, v3, Pascal
    constexpr index_t BlockSize = 128;

    constexpr index_t BPerBlock = 16;
    constexpr index_t KPerBlock = 128;
    constexpr index_t CPerBlock = 8;

    constexpr index_t BPerThread = 1;
    constexpr index_t KPerThread = 8;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<4, 1, 1, 1>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;

    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
#endif

    constexpr index_t GridSize =
        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv =
#if 1
            GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
#endif
            <GridSize,
             BlockSize,
             T,
             decltype(in_nchw_desc),
             decltype(wei_cyxk_desc),
             decltype(out_nkhw_desc),
             NPerBlock,
             KPerBlock,
             CPerBlock,
             HoPerBlock,
             WoPerBlock,
             NPerThread,
             KPerThread,
             HoPerThread,
             WoPerThread,
             GemmMPerThreadSubC,
             GemmNPerThreadSubC,
             GemmMLevel0Cluster,
             GemmNLevel0Cluster,
             GemmMLevel1Cluster,
             GemmNLevel1Cluster,
             GemmKPerThreadLoop,
             GemmDataPerReadA,
             GemmDataPerReadB,
             InBlockReorderSrcSubLengths_NCHW,
             InBlockReorderSrcClusterLengths_NCHW,
             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
             InBlockReorderDataPerRead_W,
             InBlockReorderDataPerWrite_N,
             WeiBlockCopyClusterLengths,
             WeiBlockCopyDataPerRead_K,
             OutThreadCopyDataPerWrite_W>{};

        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);

        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
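This snapshot is work in progress: the #if block defines BPerBlock/BPerThread, yet GridSize and the template argument list reference NPerBlock, HoPerBlock, WoPerBlock, NPerThread, HoPerThread, WoPerThread, InBlockReorderDataPerRead_W, InBlockReorderDataPerWrite_N, WeiBlockCopyClusterLengths, and OutThreadCopyDataPerWrite_W, none of which are declared here, so the file does not compile as-is. For reference, a minimal host-side sketch of the GridSize arithmetic; the per-block tile sizes below are assumptions chosen for illustration:

// Sketch of the grid-size computation: one workgroup per output tile,
// rounding up in each dimension. Tile sizes here are assumed values.
#include <cstdio>

using index_t = unsigned int; // assumption: index_t is an unsigned integer type

int main()
{
    constexpr index_t N = 128, K = 256, Ho = 28, Wo = 28; // 28x28 case from driver.hip.cpp
    constexpr index_t NPerBlock = 16, KPerBlock = 128,    // assumed tile sizes
                      HoPerBlock = 4, WoPerBlock = 4;

    constexpr index_t GridSize = ((N + NPerBlock - 1) / NPerBlock) *
                                 ((K + KPerBlock - 1) / KPerBlock) *
                                 ((Ho + HoPerBlock - 1) / HoPerBlock) *
                                 ((Wo + WoPerBlock - 1) / WoPerBlock);

    std::printf("GridSize = %u\n", GridSize); // 8 * 2 * 7 * 7 = 784
    return 0;
}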
driver/driver.hip.cpp

@@ -13,6 +13,7 @@
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"

 struct GeneratorTensor_1
 {
@@ -410,7 +411,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 int main(int argc, char* argv[])
 {
-#if 1
+#if 0
     // 3x3, 34x34
     constexpr index_t N = 64;
     constexpr index_t C = 256;
@@ -434,7 +435,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 3x3 filter, 28x28 image
     constexpr index_t N = 128;
     constexpr index_t C = 256;
@@ -603,7 +604,7 @@ int main(int argc, char* argv[])
 #if 1
 #if 0
     device_direct_convolution_1
-#elif 1
+#elif 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
 #elif 0
     device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
@@ -615,6 +616,8 @@ int main(int argc, char* argv[])
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
+#elif 1
+    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
 #endif
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
src/include/ConstantMergedTensorDescriptor.hip.hpp  (new file, mode 100644)

#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"

// TensorDesc: ConstantTensorDescriptor<...>
// MergedDimRanges: Sequence<FirstMergedDim, LastMergedDim>
template <class TensorDesc, class... MergedDimRanges>
struct ConstantMergedTensorDescriptor
{
    static constexpr index_t nOriginalDim = GetNumOfOriginalDimension();
    static constexpr index_t nDim         = GetNumOfDimension();

    template <class... Is>
    __host__ __device__ constexpr ConstantMergedTensorDescriptor()
    {
        constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges{}...);

        static_for<0, sizeof...(MergedDimRanges), 1>{}([&](auto I) {
            constexpr index_t i = I.Get();

            constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);

            static_assert(merged_dim_range.GetSize() == 2,
                          "wrong! should specify first and last dimension to be merged");

            static_assert(merged_dim_range.Get(Number<0>{}) < GetNumOfUnmergedDimension(),
                          "wrong!");
            static_assert(merged_dim_range.Get(Number<1>{}) < GetNumOfUnmergedDimension(),
                          "wrong!");
            static_assert(merged_dim_range.Get(Number<0>{}) <= merged_dim_range.Get(Number<1>{}),
                          "wrong!");
        });
    }

    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
    {
        return TensorDesc::GetNumOfDimension();
    }

    __host__ __device__ static constexpr index_t GetNumOfDimension()
    {
        constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges...);

        struct f_calculate_num_of_lost_dim
        {
            __host__ __device__ constexpr index_t operator()(auto I) const
            {
                constexpr index_t i = I.Get();

                constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);

                return merged_dim_range.Get(Number<1>{}) - merged_dim_range.Get(Number<0>{});
            }
        };

        constexpr index_t num_lost_dim = static_const_reduce_n<sizeof...(MergedDimRanges)>{}(
            f_calculate_num_of_lost_dim, mod_conv::plus<index_t>{});

        return TensorDesc::GetNumOfDimension() - num_lost_dim;
    }

    template <index_t IDim>
    __host__ __device__ static constexpr bool IsMergedDimension(Number<IDim>)
    {
        // not implemented
    }

    template <index_t IDim>
    __host__ __device__ static constexpr bool GetLength(Number<IDim>)
    {
        // not implemented
    }

    template <index_t IDim>
    __host__ __device__ static constexpr bool GetStride(Number<IDim>)
    {
        static_assert(!IsMergedDimension(Number<IDim>{}),
                      "wrong! A merged dimension does not have uniform stride");
        // not implemented
    }

    template <class... Is>
    __host__ __device__ auto MultiIndex2OriginalMultiIndex(Is... is) const
    {
        // not implemented
    }

    template <class... Is>
    __host__ __device__ auto OriginalMultiIndex2MultiIndex(Is... is) const
    {
        // not implemented
    }
};

template <class TensorDesc, class... MergedDimRanges>
constexpr auto make_ConstantMergedTensorDescriptor(TensorDesc, MergedDimRanges...)
{
    return ConstantMergedTensorDescriptor<TensorDesc, MergedDimRanges...>{};
}
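Each MergedDimRanges entry is a Sequence<FirstMergedDim, LastMergedDim>, and GetNumOfDimension subtracts one dimension for every original dimension merged away. A host-side sketch of that arithmetic, assuming a 4-d NCHW descriptor whose H and W (dims 2 and 3) are merged into one:

// Sketch of the dimension-count arithmetic in GetNumOfDimension():
// merging original dims [First, Last] into one loses (Last - First) dims.
#include <cstdio>

int main()
{
    constexpr unsigned nOriginalDim = 4;    // e.g. NCHW
    constexpr unsigned First = 2, Last = 3; // merge H and W into a single dim
    constexpr unsigned num_lost_dim = Last - First;          // 1
    constexpr unsigned nDim = nOriginalDim - num_lost_dim;   // 3: N, C, HxW
    static_assert(nDim == 3, "N, C and a single merged HxW dimension");
    std::printf("nDim = %u\n", nDim);
    return 0;
}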
src/include/ConstantTensorDescriptor.hip.hpp

@@ -65,7 +65,7 @@ struct ConstantTensorDescriptor
         static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent");
     }

-    __host__ __device__ static constexpr index_t GetDimension() { return nDim; }
+    __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }

     __host__ __device__ static constexpr Lengths GetLengths() { return Lengths{}; }
@@ -160,11 +160,51 @@ struct ConstantTensorDescriptor
         return multi_id;
     }

-    __host__ __device__ static constexpr auto Condense()
+    __host__ __device__ static constexpr auto Pack()
     {
         constexpr auto default_strides = calculate_default_strides(Lengths{});
         return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
     }

+    template <index_t... IDims>
+    __host__ __device__ static constexpr auto Extract(Number<IDims>... /*extracted_dims*/)
+    {
+        static_assert(sizeof...(IDims) <= GetNumOfDimension(), "wrong!");
+
+        constexpr auto extracted_lengths = Sequence<Lengths{}.Get(Number<IDims>{})...>{};
+        constexpr auto extracted_strides = Sequence<Strides{}.Get(Number<IDims>{})...>{};
+
+        return make_ConstantTensorDescriptor(extracted_lengths, extracted_strides);
+    }
+
+    template <index_t IDim, index_t SliceLen>
+    __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
+    {
+        // not implemented
+    }
+
+    template <index_t IDim, index_t... FoldLengths>
+    __host__ __device__ static constexpr auto Fold(Number<IDim>, Sequence<FoldLengths...>)
+    {
+        // not implemented
+        // need to check the Length of the dimension to be folded is divisible by FoldLengths
+    }
+
+    template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
+    __host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
+    {
+        // not implemented
+        // need to check the dimensions to be unfolded are packed; otherwise Unfold is not permitted
+    }
+
+    template <index_t... IRs>
+    __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
+    {
+        static_assert(sizeof...(IRs) == GetNumOfDimension(), "wrong! dimension is wrong");
+
+        constexpr auto map_new2old = Sequence<IRs...>{};
+
+        return make_ConstantTensorDescriptor(Lengths{}.ReorderGivenNew2Old(map_new2old),
+                                             Strides{}.ReorderGivenNew2Old(map_new2old));
+    }
 };

 template <class Lengths>
@@ -191,7 +231,7 @@ template <class TDesc>
 __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr auto desc = TDesc{};
-    constexpr index_t ndim = desc.GetDimension();
+    constexpr index_t ndim = desc.GetNumOfDimension();

     static_assert(ndim >= 2 && ndim <= 10, "wrong!");
@@ -202,7 +242,7 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
     printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n",
            s,
-           desc.GetDimension(),
+           desc.GetNumOfDimension(),
            desc.GetLength(I0),
            desc.GetLength(I1),
            desc.GetStride(I0),

The same one-line change (desc.GetDimension() -> desc.GetNumOfDimension()) repeats in the printf overloads for 3- through 10-dimensional descriptors, in the hunks at -216, -233, -253, -276, -302, -331, -364, and -400.
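The new Extract builds a descriptor from a subset of dimensions, keeping each kept dimension's original stride, so the result is generally not packed. A host-side sketch with assumed NCHW lengths (the real method does the same at compile time on Lengths and Strides):

// Sketch of what Extract(Number<0>{}, Number<2>{}) computes for a packed
// NCHW descriptor: extracted lengths keep their original strides.
#include <array>
#include <cstdio>

int main()
{
    constexpr std::array<unsigned, 4> lengths{128, 256, 28, 28};
    // packed (row-major) strides: stride[i] = product of lengths[i+1..3]
    constexpr std::array<unsigned, 4> strides{256 * 28 * 28, 28 * 28, 28, 1};

    // extract dims 0 and 2 -> lengths {128, 28}, strides {200704, 28}
    constexpr std::array<unsigned, 2> ex_lengths{lengths[0], lengths[2]};
    constexpr std::array<unsigned, 2> ex_strides{strides[0], strides[2]};

    std::printf("lengths {%u, %u}, strides {%u, %u}\n",
                ex_lengths[0], ex_lengths[1], ex_strides[0], ex_strides[1]);
    return 0;
}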
src/include/Sequence.hip.hpp

@@ -59,10 +59,22 @@ struct Sequence
     __host__ __device__ constexpr auto PopBack() const;

-    template <class F>
-    __host__ __device__ constexpr auto Transform(F f) const
+    template <index_t I, index_t X>
+    __host__ __device__ constexpr auto Insert(Number<I>, Number<X>) const
     {
-        return Sequence<f(Is)...>{};
+        index_t data[mSize + 1];
+
+        static_for<0, I, 1>{}([&](auto Iter) {
+            constexpr index_t iter = Iter.Get();
+            data[iter] = mData[iter];
+        });
+
+        data[I] = X;
+
+        static_for<I, nSize, 1>{}([&](auto Iter) {
+            constexpr index_t iter = Iter.Get();
+            data[iter + 1] = mData[iter];
+        });
     }
 };
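The new Insert is clearly work in progress: it fills a local array but does not yet return a new Sequence, and it mixes mSize with nSize. A host-side sketch of the intended semantics (the helper name and example values are assumptions):

// Runtime sketch of the intended Insert(Number<I>, Number<X>) behavior:
// copy elements [0, I), place X at position I, shift the tail right by one.
#include <array>
#include <cstdio>

template <std::size_t N>
std::array<unsigned, N + 1> insert_at(const std::array<unsigned, N>& src,
                                      std::size_t I, unsigned X)
{
    std::array<unsigned, N + 1> dst{};
    for(std::size_t i = 0; i < I; ++i)
        dst[i] = src[i];
    dst[I] = X;
    for(std::size_t i = I; i < N; ++i)
        dst[i + 1] = src[i];
    return dst;
}

int main()
{
    const std::array<unsigned, 3> s{1, 2, 3};
    const auto t = insert_at(s, 1, 9); // {1, 9, 2, 3}
    std::printf("{%u, %u, %u, %u}\n", t[0], t[1], t[2], t[3]);
    return 0;
}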
src/include/blockwise_4d_tensor_op.hip.hpp

 #pragma once
 #include "ConstantTensorDescriptor.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"

 template <index_t BlockSize, class Float, class DstDesc, class F>
 __device__ void
src/include/blockwise_gemm.hip.hpp

@@ -7,7 +7,6 @@
 template <index_t BlockSize,
           class BlockMatrixA,
           class BlockMatrixB,
           class ThreadMatrixC,
           index_t MPerThreadSubC,
           index_t NPerThreadSubC,
           index_t MLevel0Cluster,
@@ -35,51 +34,35 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");

-        constexpr auto a_block_mtx  = BlockMatrixA{};
-        constexpr auto b_block_mtx  = BlockMatrixB{};
-        constexpr auto c_thread_mtx = ThreadMatrixC{};
-
-        static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
-                      "wrong! K dimension not consistent\n");
+        static_assert(BlockMatrixA::NRow() == BlockMatrixB::NRow(),
+                      "wrong! K dimension not consistent\n");

-        constexpr index_t M = a_block_mtx.NCol(); // A is transposed
-        constexpr index_t N = b_block_mtx.NCol();
-        constexpr index_t K = a_block_mtx.NRow();
-
-        constexpr index_t MPerThread = c_thread_mtx.NRow();
-        constexpr index_t NPerThread = c_thread_mtx.NCol();
-
-        static_assert((MPerThread % MPerThreadSubC == 0) && (NPerThread % NPerThreadSubC == 0),
-                      "wrong! Cannot evenly divide thread work among repeat\n");
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
-        static_assert((M % MRepeat == 0) && (N % NRepeat == 0),
-                      "wrong! Cannot evenly divide work among repeat\n");
-
-        constexpr index_t MPerLevel1Cluster = M / MRepeat;
-        constexpr index_t NPerLevel1Cluster = N / NRepeat;
-
-        static_assert((MPerLevel1Cluster % MLevel1Cluster == 0) &&
-                          (NPerLevel1Cluster % NLevel1Cluster == 0),
-                      "wrong! Cannot evenly divide work among Level1Cluster\n");
-
-        constexpr index_t MPerLevel0Cluster = MPerLevel1Cluster / MLevel1Cluster;
-        constexpr index_t NPerLevel0Cluster = NPerLevel1Cluster / NLevel1Cluster;
-
-        static_assert((MPerLevel0Cluster % MLevel0Cluster == 0) &&
-                          (NPerLevel0Cluster % NLevel0Cluster == 0),
-                      "wrong! Cannot evenly divide work among Level0Cluster\n");
-
-        static_assert((MPerThreadSubC == MPerLevel0Cluster / MLevel0Cluster) &&
-                          (NPerThreadSubC == NPerLevel0Cluster / NLevel0Cluster),
-                      "wrong! thread work size is wrong\n");
-
-        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-
-        mMyThreadOffsetA = a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row);
-        mMyThreadOffsetB = b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col);
+        constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
+        constexpr index_t N = BlockMatrixB::NCol();
+        constexpr index_t K = BlockMatrixA::NRow();
+
+        static_assert(M % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
+                          N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
+                      "wrong! Cannot evenly divide work among\n");
+
+        static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
+                      "wrong! ThreadMatrixC lengths is wrong");
+
+        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
+
+        mMyThreadOffsetA = BlockMatrixA::Get1dIndex(0, c_thread_mtx_index.row);
+        mMyThreadOffsetB = BlockMatrixB::Get1dIndex(0, c_thread_mtx_index.col);
+    }
+
+    __device__ static auto GetThreadMatrixCLengths()
+    {
+        constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
+        constexpr index_t N = BlockMatrixB::NCol();
+
+        constexpr index_t MRepeat = M / (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster);
+        constexpr index_t NRepeat = N / (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster);
+
+        return Sequence<MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC>{};
     }

     __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
@@ -101,7 +84,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                 level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
     }

-    // this should be optimized away if input is known
     __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c,
                                                                       index_t n_in_c)
     {
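GetThreadMatrixCLengths fixes the per-thread C tile purely from the sub-tile and cluster constants: the block tile is divided across two cluster levels, and whatever remains becomes the repeat count. A host-side sketch using the Gemm* values from the v3 driver; treating the block tile as M = 128, N = 16 is an assumption about how KPerBlock and BPerBlock map onto the GEMM:

// Sketch of GetThreadMatrixCLengths(): each thread owns an
// (MRepeat * MPerThreadSubC) x (NRepeat * NPerThreadSubC) tile of C.
#include <cstdio>

int main()
{
    constexpr unsigned MPerThreadSubC = 4, MLevel0Cluster = 4, MLevel1Cluster = 4;
    constexpr unsigned NPerThreadSubC = 4, NLevel0Cluster = 2, NLevel1Cluster = 2;

    constexpr unsigned M = 128, N = 16; // assumed block-level GEMM tile

    constexpr unsigned MRepeat = M / (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster); // 2
    constexpr unsigned NRepeat = N / (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster); // 1

    std::printf("thread C tile: %u x %u\n",
                MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC); // 8 x 4
    return 0;
}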
src/include/blockwise_merged_tensor_slice_op.hip.hpp  (new file, mode 100644)

#pragma once
#include "threadwise_tensor_slice_op.hip.hpp"

// slice a merged tensor, reorder and copy it into a normal tensor
//   src: a merged tensor
//   dst: a normal tensor
template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class SubLengths,
          class ClusterLengths,
          class ThreadArrangeOrder,
          class SrcAccessOrder,
          class DstAccessOrder>
struct BlockwiseTensorSliceCopy_generic_v1
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

    __device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_id_offset,
                                                   Array<index_t, nDim> dst_block_multi_id_offset)
    {
        // only support SrcSubLengths.GetLength() == 1 on merged dimension, for now
        // check SrcDataPerRead should be 1, if last dimension is a merged dimension
        // check NDim consistent
        // calculate mSrcMyThreadOffset
        // calculate mDstMyThreadOffset
    }

    __device__ static constexpr index_t GetRegisterClipboardSize() {}

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
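Run stages the whole slice through a per-thread register "clipboard": every read happens before any write, so the source and destination sides can each use their own access order. This skeleton leaves the bodies unimplemented; a host-side sketch of the two-phase pattern itself, with sizes and the identity ordering purely illustrative:

// Host-side sketch of the load-then-store "register clipboard" pattern.
#include <cstdio>

template <unsigned ClipboardSize>
void two_phase_copy(const float* p_src, float* p_dst)
{
    float p_clipboard[ClipboardSize]; // per-thread registers in the real kernel

    for(unsigned i = 0; i < ClipboardSize; ++i) // phase 1: load in src-friendly order
        p_clipboard[i] = p_src[i];

    for(unsigned i = 0; i < ClipboardSize; ++i) // phase 2: store in dst-friendly order
        p_dst[i] = p_clipboard[i];
}

int main()
{
    float src[4] = {0, 1, 2, 3}, dst[4] = {};
    two_phase_copy<4>(src, dst);
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}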
src/include/blockwise_nd_tensor_op.hip.hpp → src/include/blockwise_tensor_slice_op.hip.hpp

 #pragma once
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"

 template <index_t BlockSize,
           class Float,
@@ -12,14 +12,16 @@ template <index_t BlockSize,
           class MapThreadCluster2SrcCluster,
           index_t SrcDataPerRead,
           index_t DstDataPerWrite>
-struct BlockwiseNdTensorCopyReorder_v3
+struct BlockwiseTensorSliceReorderCopy_v3
 {
     static constexpr index_t nDim = SrcLengths::GetSize();

     index_t mSrcMyThreadOffset;
     index_t mDstMyThreadOffset;

-    __device__ BlockwiseNdTensorCopyReorder_v3()
+    __device__ BlockwiseTensorSliceReorderCopy_v3(Array<index_t, nDim> src_block_data_multi_id_begin,
+                                                  Array<index_t, nDim> dst_block_data_multi_id_begin)
     {
         constexpr auto src_desc = SrcDesc{};
         constexpr auto dst_desc = DstDesc{};
@@ -43,8 +45,9 @@ struct BlockwiseNdTensorCopyReorder_v3
         static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");

         // sanity check: nDim
-        static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
-                          SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
+        static_assert(SrcDesc::GetNumOfDimension() == nDim &&
+                          DstDesc::GetNumOfDimension() == nDim &&
+                          SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
                           SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
                           MapThreadCluster2SrcCluster::GetSize() == nDim,
                       "wrong! nDim is not consistent\n");
@@ -112,17 +115,17 @@ struct BlockwiseNdTensorCopyReorder_v3
         static_for<0, nDim, 1>{}([&](auto IDim) {
             constexpr auto I    = decltype(IDim){};
             constexpr index_t i = I.Get();

-            // compiler: will it really compute index here, or be associated with Get1dIndex and
+            // compiler: will it really compute index here, or be merged with Get1dIndex and
             // optimized away???
             src_data_multi_id[i] *= src_sub_lengths.Get(I);
         });

-        // compiler: will it really compute index here, or be associated with Get1dIndex and
+        // compiler: will it really compute index here, or be merged with Get1dIndex and
         // optimized away???
         const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

-        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
-        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);
+        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id + src_block_data_multi_id_begin);
+        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
     }

     __device__ static constexpr index_t GetRegisterClipboardSize()
@@ -176,12 +179,12 @@ struct BlockwiseNdTensorCopyReorder_v3
             constexpr index_t clipboard_offset =
                 thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);

-            threadwise_nd_tensor_copy(SrcDesc{},
-                                      p_src + src_offset + mSrcMyThreadOffset,
-                                      thread_tensor_desc,
-                                      p_clipboard + clipboard_offset,
-                                      thread_sub_tensor_lengths,
-                                      Number<SrcDataPerRead>{});
+            threadwise_tensor_slice_copy(SrcDesc{},
+                                         p_src + src_offset + mSrcMyThreadOffset,
+                                         thread_tensor_desc,
+                                         p_clipboard + clipboard_offset,
+                                         thread_sub_tensor_lengths,
+                                         Number<SrcDataPerRead>{});
         });
     }
@@ -222,22 +225,22 @@ struct BlockwiseNdTensorCopyReorder_v3
             // write in the order of dst
 #if 1
-            threadwise_nd_tensor_copy_reorder_given_dst2src_v2(thread_tensor_desc,
-                                                               p_clipboard + clipboard_offset,
-                                                               DstDesc{},
-                                                               p_dst + dst_offset + mDstMyThreadOffset,
-                                                               thread_sub_tensor_lengths,
-                                                               MapDst2Src{});
+            threadwise_tensor_slice_copy_reorder_given_dst2src_v2(thread_tensor_desc,
+                                                                  p_clipboard + clipboard_offset,
+                                                                  DstDesc{},
+                                                                  p_dst + dst_offset + mDstMyThreadOffset,
+                                                                  thread_sub_tensor_lengths,
+                                                                  MapDst2Src{});
 #else
-            threadwise_nd_tensor_copy_reorder_given_dst2src_v3(thread_tensor_desc,
-                                                               p_clipboard + clipboard_offset,
-                                                               DstDesc{},
-                                                               p_dst + dst_offset + mDstMyThreadOffset,
-                                                               thread_sub_tensor_lengths,
-                                                               MapDst2Src{},
-                                                               Number<DstDataPerWrite>{});
+            threadwise_tensor_slice_copy_reorder_given_dst2src_v3(thread_tensor_desc,
+                                                                  p_clipboard + clipboard_offset,
+                                                                  DstDesc{},
+                                                                  p_dst + dst_offset + mDstMyThreadOffset,
+                                                                  thread_sub_tensor_lengths,
+                                                                  MapDst2Src{},
+                                                                  Number<DstDataPerWrite>{});
 #endif
         });
     }
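The thread offsets above come from Get1dIndex, which for these descriptors is the usual dot product of a multi-index with the strides; the updated constructor folds the block's beginning into the multi-index before linearizing. A host-side sketch with assumed lengths and indices:

// Sketch of the Get1dIndex offset arithmetic: a linear offset is the
// dot product of a multi-index with the descriptor's strides.
#include <array>
#include <cstdio>

template <std::size_t N>
unsigned get_1d_index(const std::array<unsigned, N>& multi_id,
                      const std::array<unsigned, N>& strides)
{
    unsigned offset = 0;
    for(std::size_t i = 0; i < N; ++i)
        offset += multi_id[i] * strides[i];
    return offset;
}

int main()
{
    // assumed packed 4-d descriptor with lengths {8, 8, 4, 4}
    const std::array<unsigned, 4> strides{128, 16, 4, 1};
    const std::array<unsigned, 4> thread_multi_id{1, 2, 0, 3};
    const std::array<unsigned, 4> block_begin{0, 4, 0, 0};

    // per-thread offset includes the block's beginning, as in the constructor above
    std::array<unsigned, 4> sum{};
    for(std::size_t i = 0; i < 4; ++i)
        sum[i] = thread_multi_id[i] + block_begin[i];

    std::printf("offset = %u\n", get_1d_index(sum, strides)); // 1*128 + 6*16 + 0 + 3 = 227
    return 0;
}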
src/include/conv_common.hip.hpp

@@ -14,8 +14,8 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-    static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
-    static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
+    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
+    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");

     static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
                   "input & weight dimension not consistent");
@@ -45,8 +45,8 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-    static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
-    static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
+    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
+    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");

     static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
                   "input & weight dimension not consistent");
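Both helpers assert 4-d input/weight descriptors and a consistent C dimension before building the output descriptor. For reference, a sketch of the output-size arithmetic, assuming these defaults follow the standard unit-stride convolution formula Ho = Hi + 2*HPad - Y + 1 (an assumption; the committed bodies are not shown in this diff):

// Sketch of the assumed output-size formula for the default descriptors.
#include <cstdio>

int main()
{
    constexpr unsigned Hi = 34, Wi = 34;   // "3x3, 34x34" case from driver.hip.cpp
    constexpr unsigned Y = 3, X = 3;       // filter
    constexpr unsigned HPad = 0, WPad = 0; // no padding

    constexpr unsigned Ho = Hi + 2 * HPad - Y + 1; // 32
    constexpr unsigned Wo = Wi + 2 * WPad - X + 1; // 32

    std::printf("output %ux%u\n", Ho, Wo);
    return 0;
}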
src/include/functional.hip.hpp

 #pragma once
 #include "constant_integral.hip.hpp"

+struct forwarder
+{
+    template <typename T>
+    __host__ __device__ constexpr T operator()(T&& x) const
+    {
+        return std::forward<T>(x);
+    }
+};
+
+// Emulate a compile-time if statement for C++14
+// Got the idea from
+// "https://baptiste-wicht.com/posts/2015/07/simulate-static_if-with-c11c14.html"
+// TODO: use if constexpr, when C++17 is supported
+template <bool Predicate>
+struct static_if
+{
+};
+
+template <>
+struct static_if<true>
+{
+    using Type = static_if<true>;
+
+    template <class F>
+    __host__ __device__ constexpr auto operator()(F f) const
+    {
+        // This is a trick for the compiler:
+        // Pass forwarder to lambda "f" as an "auto" argument, and make sure "f" uses it;
+        // this makes "f" a generic lambda, so that "f" won't be compiled until here
+        f(forwarder{});
+        return Type{};
+    }
+
+    template <class F>
+    __host__ __device__ static constexpr auto else_(F)
+    {
+        return Type{};
+    }
+};
+
+template <>
+struct static_if<false>
+{
+    using Type = static_if<false>;
+
+    template <class F>
+    __host__ __device__ constexpr auto operator()(F) const
+    {
+        return Type{};
+    }
+
+    template <class F>
+    __host__ __device__ static constexpr auto else_(F f)
+    {
+        // This is a trick for the compiler:
+        // Pass forwarder to lambda "f" as an "auto" argument, and make sure "f" uses it;
+        // this makes "f" a generic lambda, so that "f" won't be compiled until here
+        f(forwarder{});
+        return Type{};
+    }
+};
+
 template <index_t Iter, index_t Remaining, index_t Increment>
 struct static_for_impl
 {
@@ -26,22 +87,26 @@ struct static_for_impl<Iter, 0, Increment>
     }
 };

 // F signature: F(Number<I>)
 template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
     template <class F>
     __host__ __device__ void operator()(F f) const
     {
-        static_assert(NBegin < NEnd, "Wrong! we should have NBegin < NEnd");
         static_assert((NEnd - NBegin) % Increment == 0,
                       "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");

-        static_for_impl<NBegin, NEnd - NBegin, Increment>{}(f);
+        static_if<(NBegin < NEnd)>{}([&](auto forwarder) {
+            static_for_impl<NBegin, NEnd - NBegin, forwarder(Increment)>{}(f);
+        });
     }
 };

 template <index_t NLoop>
 struct static_const_reduce_n
 {
     // signature of F: F(Number<I>)
     template <class F, class Reduce>
     __host__ __device__ constexpr auto operator()(F f, Reduce r) const
     {
@@ -70,65 +135,3 @@ __host__ __device__ constexpr auto unpacker(F f)
     return [=](auto xs_array){ f(xs...); };
 }
 #endif

(The remainder of this hunk removes the forwarder, static_if<true>, and static_if<false> definitions from the bottom of the file; they are identical to the block added at the top above.)
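A minimal usage sketch of static_if as defined above (the flag name and branch bodies are hypothetical): only the branch actually taken is instantiated, because each branch is a generic lambda that is not compiled until it is invoked with the forwarder argument.

#include "functional.hip.hpp" // for static_if / forwarder, as defined above

template <bool kUseDoubleBuffer> // hypothetical compile-time flag
__host__ __device__ void run_copy()
{
    static_if<kUseDoubleBuffer>{}([&](auto fwd) {
        // instantiated only when kUseDoubleBuffer == true;
        // taking "fwd" (the forwarder) makes this a generic lambda,
        // which is what delays its compilation
    }).else_([&](auto fwd) {
        // instantiated only when kUseDoubleBuffer == false
    });
}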
src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp

@@ -3,7 +3,7 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_direct_convolution.hip.hpp"

 template <index_t GridSize,
@@ -229,7 +229,7 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
     }

     // copy output tensor from register to global mem
-    threadwise_nd_tensor_copy(out_nkhw_thread_desc,
+    threadwise_tensor_slice_copy(out_nkhw_thread_desc,
                               p_out_thread,
                               out_nkhw_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -325,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
@@ -375,7 +375,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp

@@ -5,7 +5,7 @@
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_3d_tensor_op.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -358,7 +358,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
@@ -408,7 +408,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp

@@ -4,8 +4,8 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_3d_tensor_op.hip.hpp"
-#include "blockwise_nd_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "blockwise_tensor_slice_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -127,18 +127,18 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
     // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
     constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

     const auto blockwise_in_copy_reorder =
-        BlockwiseNdTensorCopyReorder_v3<
+        BlockwiseTensorSliceReorderCopy_v3<
             BlockSize,
             Float,
             decltype(in_n_c_h_w_global_desc),
             decltype(in_c_h_w_n_block_desc),
             Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
             InBlockReorderSrcSubLengths_NCHW,
             InBlockReorderSrcClusterLengths_NCHW,
             decltype(map_chwn2nchw),
             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
             InBlockReorderDataPerRead_W,
             InBlockReorderDataPerWrite_N>{};

     // blockwise wei copy
     //   format is [CPerBlock, X * KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -347,7 +347,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
@@ -397,7 +397,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -408,7 +408,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
@@ -458,7 +458,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp

@@ -3,8 +3,8 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_nd_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "blockwise_tensor_slice_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -131,18 +131,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
     // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
     constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

     const auto blockwise_in_copy_reorder =
-        BlockwiseNdTensorCopyReorder_v3<
+        BlockwiseTensorSliceReorderCopy_v3<
             BlockSize,
             Float,
             decltype(in_n_c_h_w_global_desc),
             decltype(in_c_h_w_n_block_desc),
             Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
             InBlockReorderSrcSubLengths_NCHW,
             InBlockReorderSrcClusterLengths_NCHW,
             decltype(map_chwn2nchw),
             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
             InBlockReorderDataPerRead_W,
             InBlockReorderDataPerWrite_N>{};

     // blockwise wei copy
     //   format is [CPerBlock, KPerBlock]
@@ -407,7 +407,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
@@ -457,7 +457,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp

@@ -3,8 +3,8 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_nd_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "blockwise_tensor_slice_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -131,18 +131,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
     // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
     constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

     const auto blockwise_in_copy_reorder =
-        BlockwiseNdTensorCopyReorder_v3<
+        BlockwiseTensorSliceReorderCopy_v3<
             BlockSize,
             Float,
             decltype(in_n_c_h_w_global_desc),
             decltype(in_c_h_w_n_block_desc),
             Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
             InBlockReorderSrcSubLengths_NCHW,
             InBlockReorderSrcClusterLengths_NCHW,
             decltype(map_chwn2nchw),
             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
             InBlockReorderDataPerRead_W,
             InBlockReorderDataPerWrite_N>{};

     // blockwise wei copy
     //   format is [CPerBlock, KPerBlock]
@@ -409,7 +409,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
     constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 3, 4, 5, 6>{};

-        threadwise_nd_tensor_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
+        threadwise_tensor_slice_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
                                                            p_out_thread,
                                                            out_10d_global_desc,
@@ -458,7 +458,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
     constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{};

-        threadwise_nd_tensor_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
+        threadwise_tensor_slice_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
                                                            p_out_thread,
                                                            out_10d_global_desc,
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp

@@ -3,8 +3,8 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_nd_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "blockwise_tensor_slice_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -130,18 +130,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
     // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
     constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

     const auto blockwise_in_copy_reorder =
-        BlockwiseNdTensorCopyReorder_v3<
+        BlockwiseTensorSliceReorderCopy_v3<
             BlockSize,
             Float,
             decltype(in_n_c_h_w_global_desc),
             decltype(in_c_h_w_n_block_desc),
             Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
             InBlockReorderSrcSubLengths_NCHW,
             InBlockReorderSrcClusterLengths_NCHW,
             decltype(map_chwn2nchw),
             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
             InBlockReorderDataPerRead_W,
             InBlockReorderDataPerWrite_N>{};

     // blockwise wei copy
     //   format is [CPerBlock, KPerBlock]
@@ -390,7 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
@@ -440,7 +440,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
     }
 #endif

-        threadwise_nd_tensor_copy(out_10d_thread_desc,
+        threadwise_tensor_slice_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,