Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
31b40352
Unverified
Commit
31b40352
authored
Aug 18, 2021
by
Chao Liu
Committed by
GitHub
Aug 18, 2021
Browse files
Merge pull request #16 from ROCmSoftwarePlatform/develop
Merge develop into master
parents
5781adf5
b62bf8c3
Changes
145
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
157 additions
and
169 deletions
+157
-169
host/online_compile/kernels_batch.cpp.in
host/online_compile/kernels_batch.cpp.in
+0
-1
host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
...lver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
+143
-127
host/solver/include/convolution_problem_descriptor.hpp
host/solver/include/convolution_problem_descriptor.hpp
+4
-2
host/solver/include/solver_common.hpp
host/solver/include/solver_common.hpp
+6
-11
script/cmake-rocm.sh
script/cmake-rocm.sh
+4
-28
No files found.
host/online_compile/kernels_batch.cpp.in
deleted
100644 → 0
View file @
5781adf5
#include "${KERNEL_SRC_HPP_FILENAME}"
host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
View file @
31b40352
...
...
@@ -2,136 +2,150 @@
#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#include <numeric>
#include <sstream>
namespace
ck_driver
{
namespace
ck
{
namespace
driver
{
struct
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
ck
::
DataTypeEnum_t
ABDataTypeEnum
;
ck
::
DataTypeEnum_t
AccDataTypeEnum
;
ck
::
DataTypeEnum_t
CDataTypeEnum
;
auto
GetCompileParameterString
()
const
{
auto
param
=
std
::
stringstream
()
;
int
BlockSize
;
// clang-format off
param
<<
" -DCK_PARAM_ABDataTypeEnum="
<<
ABDataTypeEnum
<<
" -DCK_PARAM_AccDataTypeEnum="
<<
AccDataTypeEnum
<<
" -DCK_PARAM_CDataTypeEnum="
<<
CDataTypeEnum
<<
" -DCK_PARAM_BlockSize="
<<
BlockSize
<<
" -DCK_PARAM_GN0="
<<
GN0
<<
" -DCK_PARAM_GK1="
<<
GK1
<<
" -DCK_PARAM_GM1PerBlockGM11="
<<
GM1PerBlockGM11
<<
" -DCK_PARAM_GN1PerBlockGN11="
<<
GN1PerBlockGN11
<<
" -DCK_PARAM_GK0PerBlock="
<<
GK0PerBlock
<<
" -DCK_PARAM_BM1PerThreadBM11="
<<
BM1PerThreadBM11
<<
" -DCK_PARAM_BN1PerThreadBN11="
<<
BN1PerThreadBN11
<<
" -DCK_PARAM_BK0PerThread="
<<
BK0PerThread
<<
" -DCK_PARAM_BM10BN10ThreadClusterBM10Xs="
<<
BM10BN10ThreadClusterBM10Xs
[
0
]
<<
","
<<
BM10BN10ThreadClusterBM10Xs
[
1
]
<<
" -DCK_PARAM_BM10BN10ThreadClusterBN10Xs="
<<
BM10BN10ThreadClusterBN10Xs
[
0
]
<<
","
<<
BM10BN10ThreadClusterBN10Xs
[
1
]
<<
" -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1="
<<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
[
0
]
<<
","
<<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
[
1
]
<<
","
<<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
[
2
]
<<
","
<<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
[
3
]
<<
","
<<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
[
4
]
<<
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1="
<<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
[
0
]
<<
","
<<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
[
1
]
<<
","
<<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
[
2
]
<<
","
<<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
[
3
]
<<
","
<<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
[
4
]
<<
" -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1="
<<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
0
]
<<
","
<<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
1
]
<<
","
<<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
2
]
<<
","
<<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
3
]
<<
","
<<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
4
]
<<
" -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1="
<<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
0
]
<<
","
<<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
1
]
<<
","
<<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
2
]
<<
","
<<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
3
]
<<
","
<<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
[
4
]
<<
" -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1="
<<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
[
0
]
<<
","
<<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
[
1
]
<<
","
<<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
[
2
]
<<
","
<<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
[
3
]
<<
","
<<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
[
4
]
<<
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1="
<<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
[
0
]
<<
","
<<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
[
1
]
<<
","
<<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
[
2
]
<<
","
<<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
[
3
]
<<
","
<<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
[
4
]
<<
" -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1="
<<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
0
]
<<
","
<<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
1
]
<<
","
<<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
2
]
<<
","
<<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
3
]
<<
","
<<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
4
]
<<
" -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1="
<<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
0
]
<<
","
<<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
1
]
<<
","
<<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
2
]
<<
","
<<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
3
]
<<
","
<<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
[
4
]
<<
" -DCK_PARAM_CThreadTransferDstScalarPerVector="
<<
CThreadTransferDstScalarPerVector
<<
" -DCK_PARAM_HasMainKBlockLoop="
<<
static_cast
<
int
>
(
HasMainKBlockLoop
)
<<
" -DCK_PARAM_HasDoubleTailKBlockLoop="
<<
static_cast
<
int
>
(
HasDoubleTailKBlockLoop
);
// clang-format on
int
GN0
;
int
GK1
;
return
param
.
str
()
;
}
int
GM1PerBlockGM11
;
int
GN1PerBlockGN11
;
int
GK0PerBlock
;
ck
::
DataTypeEnum_t
ABDataTypeEnum
=
ck
::
DataTypeEnum_t
::
Unknown
;
ck
::
DataTypeEnum_t
AccDataTypeEnum
=
ck
::
DataTypeEnum_t
::
Unknown
;
ck
::
DataTypeEnum_t
CDataTypeEnum
=
ck
::
DataTypeEnum_t
::
Unknown
;
int
BM1PerThreadBM11
;
int
BN1PerThreadBN11
;
int
BK0PerThread
;
int
BlockSize
=
-
1
;
std
::
array
<
int
,
2
>
BM10BN10ThreadClusterBM10Xs
;
std
::
array
<
int
,
2
>
BM10BN10ThreadClusterBN10Xs
;
int
GN0
=
-
1
;
int
GK1
=
-
1
;
std
::
array
<
int
,
5
>
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
;
std
::
array
<
int
,
5
>
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
;
std
::
array
<
int
,
5
>
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
;
std
::
array
<
int
,
5
>
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
;
int
GM1PerBlockGM11
=
-
1
;
int
GN1PerBlockGN11
=
-
1
;
int
GK0PerBlock
=
-
1
;
std
::
array
<
int
,
5
>
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
;
std
::
array
<
int
,
5
>
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
;
std
::
array
<
int
,
5
>
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
;
std
::
array
<
int
,
5
>
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
;
int
BM1PerThreadBM11
=
-
1
;
int
BN1PerThreadBN11
=
-
1
;
int
BK0PerThread
=
-
1
;
int
CThreadTransferDstScalarPerVector
;
std
::
array
<
int
,
2
>
BM10BN10ThreadClusterBM10Xs
=
{
-
1
,
-
1
};
std
::
array
<
int
,
2
>
BM10BN10ThreadClusterBN10Xs
=
{
-
1
,
-
1
};
bool
HasMainKBlockLoop
;
bool
HasDoubleTailKBlockLoop
;
std
::
array
<
int
,
5
>
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
std
::
array
<
int
,
5
>
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
std
::
array
<
int
,
5
>
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
std
::
array
<
int
,
5
>
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
/// Serializes every compile parameter of this struct into one string of
/// preprocessor definitions.
///
/// Each field is written as " -DCK_PARAM_<FieldName>=<value>" (note the leading
/// space), in a fixed order. Scalar fields are rendered with std::to_string;
/// the bool fields therefore appear as 0 or 1. Array fields are rendered as
/// their elements joined by ',' with no spaces.
///
/// @return the concatenated definitions as a std::string.
auto GetCompileParameterString() const
{
    // Renders all elements of a fixed-size int array as "a,b,c,...".
    const auto csv = [](const auto& values) {
        std::string joined;
        const char* separator = "";
        for(const auto& value : values)
        {
            joined += separator;
            joined += std::to_string(value);
            separator = ",";
        }
        return joined;
    };

    std::string flags;

    // data types
    flags += " -DCK_PARAM_ABDataTypeEnum=";
    flags += std::to_string(ABDataTypeEnum);
    flags += " -DCK_PARAM_AccDataTypeEnum=";
    flags += std::to_string(AccDataTypeEnum);
    flags += " -DCK_PARAM_CDataTypeEnum=";
    flags += std::to_string(CDataTypeEnum);

    // scalar tuning values
    flags += " -DCK_PARAM_BlockSize=";
    flags += std::to_string(BlockSize);
    flags += " -DCK_PARAM_GN0=";
    flags += std::to_string(GN0);
    flags += " -DCK_PARAM_GK1=";
    flags += std::to_string(GK1);
    flags += " -DCK_PARAM_GM1PerBlockGM11=";
    flags += std::to_string(GM1PerBlockGM11);
    flags += " -DCK_PARAM_GN1PerBlockGN11=";
    flags += std::to_string(GN1PerBlockGN11);
    flags += " -DCK_PARAM_GK0PerBlock=";
    flags += std::to_string(GK0PerBlock);
    flags += " -DCK_PARAM_BM1PerThreadBM11=";
    flags += std::to_string(BM1PerThreadBM11);
    flags += " -DCK_PARAM_BN1PerThreadBN11=";
    flags += std::to_string(BN1PerThreadBN11);
    flags += " -DCK_PARAM_BK0PerThread=";
    flags += std::to_string(BK0PerThread);

    // two-element thread-cluster arrays
    flags += " -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=";
    flags += csv(BM10BN10ThreadClusterBM10Xs);
    flags += " -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=";
    flags += csv(BM10BN10ThreadClusterBN10Xs);

    // five-element A block-transfer arrays
    flags += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=";
    flags += csv(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1);
    flags += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=";
    flags += csv(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1);
    flags += " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=";
    flags += csv(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1);
    flags += " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=";
    flags += csv(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1);

    // five-element B block-transfer arrays
    flags += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=";
    flags += csv(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1);
    flags += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=";
    flags += csv(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1);
    flags += " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=";
    flags += csv(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1);
    flags += " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=";
    flags += csv(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1);

    // output vector width and loop flags (bools are rendered as 0/1)
    flags += " -DCK_PARAM_CThreadTransferDstScalarPerVector=";
    flags += std::to_string(CThreadTransferDstScalarPerVector);
    flags += " -DCK_PARAM_HasMainKBlockLoop=";
    flags += std::to_string(HasMainKBlockLoop);
    flags += " -DCK_PARAM_HasDoubleTailKBlockLoop=";
    flags += std::to_string(HasDoubleTailKBlockLoop);

    return flags;
}
std
::
array
<
int
,
5
>
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
std
::
array
<
int
,
5
>
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
std
::
array
<
int
,
5
>
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
std
::
array
<
int
,
5
>
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
int
CThreadTransferDstScalarPerVector
=
-
1
;
bool
HasMainKBlockLoop
=
false
;
bool
HasDoubleTailKBlockLoop
=
false
;
};
struct
TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
...
...
@@ -229,8 +243,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
CalculateCompileParameterBasedOnTunable
(
const
ConvolutionProblemDescriptor
&
conv_problem_desc
,
const
TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
&
tunable
)
{
using
namespace
ck
;
const
int
C
=
conv_problem_desc
.
C
;
const
int
Y
=
conv_problem_desc
.
Y
;
const
int
X
=
conv_problem_desc
.
X
;
...
...
@@ -247,12 +259,17 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
DataTypeEnum_t
AccDataTypeEnum
;
switch
(
ABDataTypeEnum
)
if
(
ABDataTypeEnum
==
DataTypeEnum_t
::
Float
||
ABDataTypeEnum
==
DataTypeEnum_t
::
Half
)
{
AccDataTypeEnum
=
DataTypeEnum_t
::
Float
;
}
else
if
(
ABDataTypeEnum
==
DataTypeEnum_t
::
Int8
)
{
case
DataTypeEnum_t
::
Float
:
case
DataTypeEnum_t
::
Half
:
AccDataTypeEnum
=
DataTypeEnum_t
::
Float
;
break
;
case
DataTypeEnum_t
::
Int8
:
AccDataTypeEnum
=
DataTypeEnum_t
::
Int32
;
break
;
default:
return
std
::
make_tuple
(
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{},
false
);
AccDataTypeEnum
=
DataTypeEnum_t
::
Int32
;
}
else
{
return
std
::
make_tuple
(
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{},
false
);
}
const
int
BlockSize
=
tunable
.
BlockSize
;
...
...
@@ -342,7 +359,7 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
for
(
const
auto
&
tunable
:
generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw
())
{
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
compile_param
;
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
compile_param
{}
;
bool
found
=
false
;
std
::
tie
(
compile_param
,
found
)
=
...
...
@@ -368,8 +385,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
IsValidCompileParameter
(
const
ConvolutionProblemDescriptor
&
conv_problem_desc
,
const
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
&
compile_param
)
{
using
namespace
ck
;
const
int
N
=
conv_problem_desc
.
N
;
const
int
K
=
conv_problem_desc
.
K
;
const
int
C
=
conv_problem_desc
.
C
;
...
...
@@ -669,5 +684,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
}
};
}
// namespace ck_driver
}
// namespace driver
}
// namespace ck
#endif
host/solver/include/convolution_problem_descriptor.hpp
View file @
31b40352
#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
#define CONVOLUTION_PROBLEM_DESCRIPTOR
namespace
ck_driver
{
namespace
ck
{
namespace
driver
{
struct
ConvolutionProblemDescriptor
{
...
...
@@ -75,5 +76,6 @@ struct ConvolutionProblemDescriptor
/// Returns 2 * N * K * C * Y * X * Ho * Wo for this problem.
///
/// The leading `2L` literal makes the product evaluate in at least `long`
/// before it is converted to the std::size_t return type.
/// NOTE(review): the factor 2 presumably counts one multiply and one add per
/// accumulated term -- confirm against the callers that report performance.
std::size_t CalculateFlop() const
{
    const auto flop = 2L * N * K * C * Y * X * Ho * Wo;
    return flop;
}
};
}
// namespace ck_driver
}
// namespace driver
}
// namespace ck
#endif
host/
driver_online/include/online_dri
ver_common.hpp
→
host/
solver/include/sol
ver_common.hpp
View file @
31b40352
#ifndef
ONLINE_DRI
VER_COMMON_HPP
#define
ONLINE_DRI
VER_COMMON_HPP
#ifndef
CK_SOL
VER_COMMON_HPP
#define
CK_SOL
VER_COMMON_HPP
namespace
ck_driver
{
/// Returns the flag string " -std=c++17".
///
/// The leading space is part of the returned value, so the result can be
/// appended directly to another option string.
inline auto get_ck_hip_online_compile_common_flag()
{
    return std::string(" -std=c++17");
}
namespace
ck
{
namespace
driver
{
// greatest common divisor, aka highest common factor
inline
int
gcd
(
int
x
,
int
y
)
...
...
@@ -47,5 +41,6 @@ auto gcd(X x, Ys... ys)
return
gcd
(
x
,
gcd
(
ys
...));
}
}
// namespace ck_driver
}
// namespace driver
}
// namespace ck
#endif
script/cmake-rocm.sh
View file @
31b40352
...
...
@@ -3,40 +3,16 @@ rm -f CMakeCache.txt
rm
-f
*
.cmake
rm
-rf
CMakeFiles
MY_PROJECT_SOURCE
=
../../..
/
MY_PROJECT_SOURCE
=
../../..
MY_PROJECT_INSTALL
=
../install.dir
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
HALF_INCLUDE_DIR
=
"/root/workspace/external/half/include"
\
-D
BUILD_DEV
=
ON
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_CXX_FLAGS
=
"-DCK_AMD_GPU_GFX906 -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=
$PWD
"
\
-D
HIP_ONLINE_COMPILER_FLAGS
=
"-DCK_AMD_GPU_GFX906"
\
-D
CMAKE_CXX_FLAGS
=
"-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=
$PWD
"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
${
MY_PROJECT_SOURCE
}
#CXX_FLAG_TMP=-Weverything
# -Wno-c++98-compat \
# -Wno-c++98-compat-pedantic \
# -Wno-conversion \
# -Wno-double-promotion \
# -Wno-exit-time-destructors \
# -Wno-extra-semi \
# -Wno-float-conversion \
# -Wno-gnu-anonymous-struct \
# -Wno-gnu-zero-variadic-macro-arguments \
# -Wno-missing-noreturn \
# -Wno-missing-prototypes \
# -Wno-nested-anon-types \
# -Wno-padded \
# -Wno-return-std-move-in-c++11 \
# -Wno-shorten-64-to-32 \
# -Wno-sign-conversion \
# -Wno-unknown-warning-option \
# -Wno-unused-command-line-argument \
# -Wno-weak-vtables \
# -Wno-covered-switch-default \
# -Wno-disabled-macro-expansion \
# -Wno-undefined-reinterpret-cast
Prev
1
…
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment