Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4ec493ec
Commit
4ec493ec
authored
Sep 10, 2021
by
ltqin
Browse files
change some variable name
parent
1343569e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
26 additions
and
26 deletions
+26
-26
CMakeLists.txt
CMakeLists.txt
+1
-1
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk.hpp
...forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk.hpp
+24
-24
script/cmake-rocm.sh
script/cmake-rocm.sh
+1
-1
No files found.
CMakeLists.txt
View file @
4ec493ec
...
...
@@ -43,7 +43,7 @@ message(STATUS "Build with HIP ${hip_VERSION}")
message
(
"HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}"
)
# CMAKE_CXX_FLAGS
SET
(
BUILD_DEV ON
CACHE BOOL
"BUILD_DEV"
)
SET
(
BUILD_DEV OFF
CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
string
(
APPEND CMAKE_CXX_FLAGS
" -Werror -Weverything"
)
endif
()
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk.hpp
View file @
4ec493ec
...
...
@@ -15,16 +15,16 @@ template <typename TInWei,
typename
InLeftPads
,
typename
InRightPads
>
void
device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk
(
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
const
InLengths
&
in_n_hi_wi_g_c_lengths
,
const
WeiLengths
&
wei_g_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_g_k_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_hi_wi_c
,
const
Tensor
<
TInWei
>&
wei_k_y_x_c
,
Tensor
<
TOut
>&
out_n_ho_wo_k
,
const
Tensor
<
TInWei
>&
in_n_hi_wi_g_c
,
const
Tensor
<
TInWei
>&
wei_g_k_y_x_c
,
Tensor
<
TOut
>&
out_n_ho_wo_g_k
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
...
...
@@ -35,19 +35,19 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
// constexpr auto I4 = Number<4>{};
constexpr
auto
I4
=
Number
<
4
>
{};
DeviceMem
in_n_hi_wi_c_device_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_y_x_c_device_buf
(
sizeof
(
TInWei
)
*
wei_k_y_x_c
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_ho_wo_k_device_buf
(
sizeof
(
TOut
)
*
out_n_ho_wo_k
.
mDesc
.
GetElementSpace
());
DeviceMem
in_n_hi_wi_g_c_device_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_g_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_g_k_y_x_c_device_buf
(
sizeof
(
TInWei
)
*
wei_g_k_y_x_c
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_ho_wo_g_k_device_buf
(
sizeof
(
TOut
)
*
out_n_ho_wo_g_k
.
mDesc
.
GetElementSpace
());
in_n_hi_wi_c_device_buf
.
ToDevice
(
in_n_hi_wi_c
.
mData
.
data
());
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
in_n_hi_wi_g_c_device_buf
.
ToDevice
(
in_n_hi_wi_g_c
.
mData
.
data
());
wei_g_k_y_x_c_device_buf
.
ToDevice
(
wei_g_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_g_k_device_buf
.
ToDevice
(
out_n_ho_wo_g_k
.
mData
.
data
());
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_naive_tensor_descriptor_packed
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_naive_tensor_descriptor_packed
(
out_n_ho_wo_k_lengths
);
const
auto
in_n_hi_wi_g_c_desc
=
make_naive_tensor_descriptor_packed
(
in_n_hi_wi_g_c_lengths
);
const
auto
wei_g_k_y_x_c_desc
=
make_naive_tensor_descriptor_packed
(
wei_g_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_g_k_desc
=
make_naive_tensor_descriptor_packed
(
out_n_ho_wo_g_k_lengths
);
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
...
...
@@ -176,14 +176,14 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
1
,
2
,
8
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
4
,
64
,
1
>
;
using
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
1
,
1
,
2
,
8
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1
=
Sequence
<
1
,
4
,
64
,
1
>
;
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK1
=
8
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_GemmK1
=
8
;
using
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
1
,
4
,
8
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
4
,
64
,
1
>
;
using
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
1
,
1
,
4
,
8
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1
=
Sequence
<
1
,
4
,
64
,
1
>
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK1
=
8
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmK1
=
8
;
...
...
@@ -220,9 +220,9 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
#endif
const
auto
descs
=
transform_forward_convolution_into_gemm_v4r4r4_nhwgc_gkyxc_nhwgk_pad
(
in_n_hi_wi_c_desc
,
wei_k_y_x_c_desc
,
out_n_ho_wo_k_desc
,
transform_forward_convolution_into_gemm_v4r4r4_nhwgc_gkyxc_nhwgk_pad
(
in_n_hi_wi_g_c_desc
,
wei_g_k_y_x_c_desc
,
out_n_ho_wo_g_k_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
...
...
@@ -351,5 +351,5 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
}
*/
// copy result back to host
out_n_ho_wo_k_device_buf
.
FromDevice
(
out_n_ho_wo_k
.
mData
.
data
());
out_n_ho_wo_g_k_device_buf
.
FromDevice
(
out_n_ho_wo_g_k
.
mData
.
data
());
}
script/cmake-rocm.sh
View file @
4ec493ec
...
...
@@ -9,7 +9,7 @@ MY_PROJECT_INSTALL=../install.dir
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${MY_PROJECT_INSTALL}
\
-D
HALF_INCLUDE_DIR
=
"/root/workspace/external/half/include"
\
-D
BUILD_DEV
=
ON
\
-D
BUILD_DEV
=
OFF
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_CXX_FLAGS
=
"-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment