Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
8753e615
Commit
8753e615
authored
Mar 25, 2021
by
Jing Zhang
Browse files
navi test
parent
418ca5ee
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
11 additions
and
9 deletions
+11
-9
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
...nel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+4
-2
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+2
-2
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+4
-4
script/cmake-rocm3.7.sh
script/cmake-rocm3.7.sh
+1
-1
No files found.
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
View file @
8753e615
...
@@ -51,7 +51,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
...
@@ -51,7 +51,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
{
{
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
{
const
auto
E
=
4
*
3
*
3
;
const
expr
auto
E
=
EPerBlock
*
3
*
3
;
constexpr
auto
max_lds_align
=
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
...
@@ -84,7 +84,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
...
@@ -84,7 +84,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
const
auto
E
=
a_e_k_global_desc
.
GetLength
(
I0
);
constexpr
auto
E
=
EPerBlock
*
3
*
3
;
// const auto E = a_e_k_global_desc.GetLength(I0);
const
auto
K
=
a_e_k_global_desc
.
GetLength
(
I1
);
const
auto
K
=
a_e_k_global_desc
.
GetLength
(
I1
);
const
auto
N
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I1
);
const
auto
N
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I1
);
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
8753e615
...
@@ -11,9 +11,9 @@
...
@@ -11,9 +11,9 @@
#define CK_DEVICE_BACKEND_AMD 1
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
// GPU ID
#define CK_AMD_GPU_GFX906
1
#define CK_AMD_GPU_GFX906
0
#define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX1030
0
#define CK_AMD_GPU_GFX1030
1
// HIP version
// HIP version
#ifndef CK_HIP_VERSION_FLAT
#ifndef CK_HIP_VERSION_FLAT
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
8753e615
...
@@ -113,22 +113,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
...
@@ -113,22 +113,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr
index_t
KPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
16
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
EPerBlock
=
1
;
constexpr
index_t
EPerBlock
=
4
;
constexpr
index_t
KPerThread
=
16
;
constexpr
index_t
KPerThread
=
16
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
1
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
9
,
1
>
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
9
,
1
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
4
,
16
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
EPerBlock
,
16
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
2
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
1
;
constexpr
auto
conv_driver
=
constexpr
auto
conv_driver
=
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
<
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
<
...
...
script/cmake-rocm3.7.sh
View file @
8753e615
...
@@ -10,7 +10,7 @@ cmake
...
@@ -10,7 +10,7 @@ cmake
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
DEVICE_BACKEND
=
"AMD"
\
-D
DEVICE_BACKEND
=
"AMD"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx
906
-gline-tables-only -save-temps=
$CWD
-ftemplate-backtrace-limit=0"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx
1030
-gline-tables-only -save-temps=
$CWD
-ftemplate-backtrace-limit=0"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm"
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment