Project: gaoqiong / composable_kernel_ROCM

Commit f20e48f1, authored Nov 05, 2024 by aska-0096

    Merge branch 'develop' of https://github.com/ROCm/composable_kernel into update_cka8w8

Parents: b97c6876, 0c9012fb
Changes: 361 changed files in total; this page (1 of 19) shows 20 changed files, with 468 additions and 44 deletions (+468 / -44).
CMakeLists.txt  (+17 / -15)
README.md  (+9 / -5)
example/01_gemm/common.hpp  (+8 / -7)
example/01_gemm/run_gemm_example.inc  (+15 / -12)
example/01_gemm/run_gemm_example_streamk_v2.inc  (+1 / -1)
example/01_gemm/run_gemm_example_v2.inc  (+1 / -1)
example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp  (+3 / -3)
example/62_convnd_activ/CMakeLists.txt  (+1 / -0)
example/62_convnd_activ/dynamic_unary/CMakeLists.txt  (+45 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp  (+238 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp  (+13 / -0)
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp  (+13 / -0)
CMakeLists.txt (view file @ f20e48f1)

@@ -137,7 +137,7 @@ if(GPU_TARGETS)
 else()
     set(USER_GPU_TARGETS 0)
 endif()
-find_package(hip)
+find_package(hip REQUIRED)
 # No assumption that HIP kernels are launched with uniform block size for backward compatibility
 # SWDEV-413293 and https://reviews.llvm.org/D155213
 math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
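For reference, the math() expression packs the HIP version into a single comparable integer: for HIP 6.2.3 it evaluates to (6 * 1000 + 2) * 100000 + 3 = 600200003, so later version checks can use plain integer comparisons.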
@@ -170,27 +170,30 @@ else()
         set(CK_GPU_TARGETS ${GPU_TARGETS})
     endif()
 endif()
+#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
+if(NOT USER_GPU_TARGETS)
+    set(GPU_TARGETS "")
+endif()
 #make sure all the targets on the list are actually supported by the current compiler
 rocm_check_target_ids(SUPPORTED_GPU_TARGETS
     TARGETS ${CK_GPU_TARGETS})
 message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
-if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
-    message("Enabling XDL instances")
-    add_definitions(-DCK_USE_XDL)
-    set(CK_USE_XDL "ON")
-endif()
-if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
-    message("Enabling WMMA instances")
-    add_definitions(-DCK_USE_WMMA)
-    set(CK_USE_WMMA "ON")
-endif()
+if(GPU_TARGETS)
+    if(GPU_TARGETS MATCHES "gfx9")
+        message("Enabling XDL instances")
+        add_definitions(-DCK_USE_XDL)
+        set(CK_USE_XDL "ON")
+    endif()
+    if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
+        message("Enabling WMMA instances")
+        add_definitions(-DCK_USE_WMMA)
+        set(CK_USE_WMMA "ON")
+    endif()
+else()
+    add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
+    set(CK_USE_XDL "ON")
+    set(CK_USE_WMMA "ON")
+endif()
 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
 if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
     add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
 endif()
 # CK config file to record supported datatypes, etc.
 configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
@@ -318,7 +321,6 @@ link_libraries(${OpenMP_gomp_LIBRARY})
 link_libraries(${OpenMP_pthread_LIBRARY})

 ## HIP
-find_package(HIP REQUIRED)
 # Override HIP version in config.h, if necessary.
 # The variables set by find_package() can't be overwritten,
 # therefore let's use intermediate variables.
@@ -578,7 +580,7 @@ rocm_package_setup_component(profiler
 )

 add_subdirectory(profiler)

-if(CK_USE_CODEGEN AND (GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
+if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
     add_subdirectory(codegen)
 endif()
README.md (view file @ f20e48f1)

@@ -137,12 +137,11 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
 You can find instructions for running ckProfiler in [profiler](/profiler).

-Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
+Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
+However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
+crash. On average, you should expect each thread to use ~2Gb of RAM.
 Depending on the number of CPU cores and the amount of RAM on your system, you may want to
-limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
-By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
-crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
+limit the number of threads. For example, if you have a 128-core CPU and 128 Gb of RAM it's advisable to use `-j32`.

 Additional cmake flags can be used to significantly speed-up the build:
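The arithmetic behind the new example: at the quoted ~2Gb per compile thread, `-j32` peaks around 64Gb, which leaves comfortable headroom on a 128Gb machine, whereas an unrestricted `-j` on 128 cores could demand roughly 256Gb.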
@@ -154,6 +153,11 @@ Additional cmake flags can be used to significantly speed-up the build:
 `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
 other platforms have faster instances, such as `xdl` or `wmma`, available.
+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
+such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not
+have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
+architectures like the MI100/MI200 for the functional support only.

 ## Using sccache for building

 The default CK Docker images come with a pre-installed version of sccache, which supports clang
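In practice this means configuring with `-DCK_USE_FP8_ON_UNSUPPORTED_ARCH=ON` when targeting gfx908/gfx90a and fp8 coverage is needed; as the new README text says, the resulting instances are for functional validation rather than performance.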
example/01_gemm/common.hpp (view file @ f20e48f1)

@@ -75,9 +75,10 @@ struct ProblemSizeSplitK final
 struct ExecutionConfig final
 {
-    bool do_verification = true;
-    int init_method      = 2;
-    bool time_kernel     = false;
+    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
+    int do_verification = 3;
+    int init_method     = 2;
+    bool time_kernel    = false;
 };

 template <ck::index_t... Is>
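The run_gemm helpers later in this commit consume the new integer levels as two independent checks; the gating reduces to two booleans. A minimal sketch using the example's own names:

    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
    const bool run_cpu_check = (config.do_verification == 1) || (config.do_verification == 3);
    const bool run_gpu_check = (config.do_verification == 2) || (config.do_verification == 3);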
@@ -126,7 +127,7 @@ bool parse_cmd_args<ProblemSize>(int argc,
     }
     else
     {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                   << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl

@@ -176,7 +177,7 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
     else
     {
         std::cerr
-            << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+            << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
             << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
             << "arg3: time kernel (0=no, 1=yes)" << std::endl
             << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl

@@ -225,7 +226,7 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
     }
     else
     {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                   << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl

@@ -275,7 +276,7 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
     }
     else
     {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                   << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl
example/01_gemm/run_gemm_example.inc (view file @ f20e48f1)

@@ -330,7 +330,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     bool pass = true;

-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
     {
         // CPU verification
         auto ref_gemm = ReferenceGemmInstance{};
@@ -353,13 +353,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        pass &= !ck::utils::check_err(c_m_n_device_result,
-                                      c_m_n_host_result,
-                                      "Error: Incorrect results!",
-                                      get_rtol<CDataType>(),
-                                      get_atol<CDataType>());
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
 #endif
+    }
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
         // GPU verification
         auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
         auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
@@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
c_m_n_device_ref_buf
.
FromDevice
(
c_m_n_device_ref_result
.
mData
.
data
());
c_m_n_device_ref_buf
.
FromDevice
(
c_m_n_device_ref_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
pass
&=
!
ck
::
utils
::
check_err
(
c_m_n_device_result
,
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_device_ref_result
,
c_m_n_device_ref_result
,
"Error: Incorrect results!"
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
get_atol
<
CDataType
>
());
}
}
return
!
pass
;
return
pass
==
true
;
}
}
bool
run_gemm_example
(
int
argc
,
char
*
argv
[])
bool
run_gemm_example
(
int
argc
,
char
*
argv
[])
...
...
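Note the polarity cleanup in this file: ck::utils::check_err returns true when results match, so the old code negated it into `pass` and then negated again at the return. The new code keeps the natural sense (`pass &= check_err(...)`; `return pass == true`), which is equivalent but far easier to read.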
example/01_gemm/run_gemm_example_streamk_v2.inc (view file @ f20e48f1)

@@ -241,7 +241,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     }

     bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
     {
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
example/01_gemm/run_gemm_example_v2.inc (view file @ f20e48f1)

@@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     }

     bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
     {
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp (view file @ f20e48f1)

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]

@@ -60,14 +60,14 @@ struct AddAddRelu
     {
         const ck::half_t x = c + d0 + d1;
-        ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
     }

     __host__ __device__ void
     operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
     {
         const float x = c + (d0 + d1);
-        ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
     }
 };
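The AddAddRelu change above replaces explicit template arguments on Relu's call operator with deduction. A minimal sketch of the two spellings, using a stand-in functor shaped like CK's element-wise ops (ReluLike here is hypothetical):

    struct ReluLike
    {
        // Member template, deducible from the call arguments.
        template <typename Y, typename X>
        void operator()(Y& y, const X& x) const
        {
            y = x > X{0} ? x : X{0};
        }
    };

    void demo()
    {
        float e       = 0.f;
        const float x = -1.f;
        ReluLike{}.template operator()<float>(e, x); // old style: Y given explicitly
        ReluLike{}.operator()(e, x);                 // new style: Y and X deduced
    }

Both calls resolve to the same instantiation; dropping the explicit argument simply lets deduction do the work and reads less noisily.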
example/62_convnd_activ/CMakeLists.txt (view file @ f20e48f1)

@@ -6,6 +6,7 @@ add_subdirectory(convscale_add)
 add_subdirectory(convscale_reduce)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)
+add_subdirectory(dynamic_unary)

 add_custom_target(example_convnd_activ_xdl)

 # ScaleAdd ScaleAdd Relu
example/62_convnd_activ/dynamic_unary/CMakeLists.txt (new file, 0 → 100644, view file @ f20e48f1)

list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
        add_custom_target(example_convnd_activ_dynamic_unary_xdl)

        # Sigmoid
        add_example_executable(example_convnd_fwd_xdl_dynamic_sigmoid_fp16 convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_sigmoid_fp16)
        # Tanh
        add_example_executable(example_convnd_fwd_xdl_dynamic_tanh_fp16 convnd_fwd_xdl_dynamic_tanh_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_tanh_fp16)
        # Relu
        add_example_executable(example_convnd_fwd_xdl_dynamic_relu_fp16 convnd_fwd_xdl_dynamic_relu_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_relu_fp16)
        # SoftRelu
        add_example_executable(example_convnd_fwd_xdl_dynamic_softrelu_fp16 convnd_fwd_xdl_dynamic_softrelu_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_softrelu_fp16)
        # Abs
        add_example_executable(example_convnd_fwd_xdl_dynamic_abs_fp16 convnd_fwd_xdl_dynamic_abs_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_abs_fp16)
        # Pow
        add_example_executable(example_convnd_fwd_xdl_dynamic_pow_fp16 convnd_fwd_xdl_dynamic_pow_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_pow_fp16)
        # Clipped Relu
        add_example_executable(example_convnd_fwd_xdl_dynamic_clippedrelu_fp16 convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_clippedrelu_fp16)
        # Leaky Relu
        add_example_executable(example_convnd_fwd_xdl_dynamic_leakyrelu_fp16 convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_leakyrelu_fp16)
        # Elu
        add_example_executable(example_convnd_fwd_xdl_dynamic_elu_fp16 convnd_fwd_xdl_dynamic_elu_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_elu_fp16)
        # Swish
        add_example_executable(example_convnd_fwd_xdl_dynamic_swish_fp16 convnd_fwd_xdl_dynamic_swish_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_swish_fp16)
        # PassThrough
        add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fwd_xdl_dynamic_passthrough_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16)
        # Logistic
        add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp)
        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16)

        set(target 1)
    endif()
endforeach()
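The foreach/target pattern registers the example targets at most once: the first GPU_TARGETS entry found in gpu_list flips `target` to 1, so subsequent matches are skipped, and nothing is added at all unless at least one XDL-capable gfx9 target is being built.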
example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"

#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

constexpr ck::index_t NDimSpatial = 3;

using InDataType       = ck::half_t;
using WeiDataType      = ck::half_t;
using AccDataType      = float;
using CShuffleDataType = ck::half_t;
using OutDataType      = ck::half_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InLayout  = ck::tensor_layout::convolution::GNDHWC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::GNDHWK;

using InElementOp      = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp     = ck::tensor_operation::element_wise::PassThrough;
using DynamicElementOp = ck::tensor_operation::element_wise::DynamicUnaryOp;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

using DeviceGroupedConvNDActivInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<>,
        OutLayout,
        InDataType,
        WeiDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<>,
        OutDataType,
        InElementOp,
        WeiElementOp,
        DynamicElementOp,
        ConvSpec,       // ConvForwardSpecialization
        GemmSpec,       // GemmSpecialization
        1,              //
        256,            // BlockSize
        128,            // MPerBlock
        256,            // NPerBlock
        32,             // KPerBlock
        8,              // AK1
        8,              // BK1
        32,             // MPerXdl
        32,             // NPerXdl
        2,              // MXdlPerWave
        4,              // NXdlPerWave
        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
        2,              // ABlockTransferSrcVectorDim
        8,              // ABlockTransferSrcScalarPerVector
        8,              // ABlockTransferDstScalarPerVector_AK1
        1,              // ABlockLdsExtraM
        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
        2,              // BBlockTransferSrcVectorDim
        8,              // BBlockTransferSrcScalarPerVector
        8,              // BBlockTransferDstScalarPerVector_BK1
        1,              // BBlockLdsExtraN
        1,
        1,
        S<1, 32, 1, 8>,
        8>;

template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
bool run_grouped_conv(bool do_verification,
                      int init_method,
                      bool time_kernel,
                      const ck::utils::conv::ConvParam& conv_param,
                      const HostTensorDescriptor& in_g_n_c_wis_desc,
                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
                      const HostTensorDescriptor& out_g_n_k_wos_desc,
                      const InElementOp& in_element_op,
                      const WeiElementOp& wei_element_op,
                      const OutElementOp& out_element_op)
{
    Tensor<InDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);

    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());

    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};

    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
    copy(conv_param.input_left_pads_, input_left_pads);
    copy(conv_param.input_right_pads_, input_right_pads);

    // do Conv
    auto conv     = DeviceConvNDFwdInstance{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                      wei_device_buf.GetDeviceBuffer(),
                                      std::array<const void*, 0>{},
                                      out_device_buf.GetDeviceBuffer(),
                                      a_g_n_c_wis_lengths,
                                      a_g_n_c_wis_strides,
                                      b_g_k_c_xs_lengths,
                                      b_g_k_c_xs_strides,
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
                                      e_g_n_k_wos_lengths,
                                      e_g_n_k_wos_strides,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,
                                      input_right_pads,
                                      in_element_op,
                                      wei_element_op,
                                      out_element_op);

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "The device op with the specified compilation parameters does "
            "not support this convolution problem.");
    }

    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
    float tflops          = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec      = num_btype / 1.E6 / avg_time;
    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << conv.GetTypeString() << std::endl;

    if(do_verification)
    {
        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                     InDataType,
                                                                     WeiDataType,
                                                                     OutDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
                                                                     OutElementOp>();

        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
                                                  out_host,
                                                  conv_param.conv_filter_strides_,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
                                                  in_element_op,
                                                  wei_element_op,
                                                  out_element_op);

        ref_invoker.Run(ref_argument);

        out_device_buf.FromDevice(out_device.mData.data());

        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
    }

    return true;
}
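On the units in the perf line: invoker.Run returns avg_time in milliseconds, so flop / 1e9 / avg_time equals flop / (1e12 * seconds), i.e. TFLOP/s, and num_btype / 1e6 / avg_time equals bytes / (1e9 * seconds), i.e. GB/s.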
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::UnaryAbs out_element_op;
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::ClippedRelu out_element_op(0.f, 1.f);
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::Elu out_element_op(2.f);
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::LeakyRelu out_element_op(0.f);
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::Logistic out_element_op(1.0f);
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::PassThrough out_element_op;
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::Power out_element_op(4.f, 1.f, 2.f);
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::Relu out_element_op;
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::Sigmoid out_element_op;
    return !run_convnd_example(argc, argv, out_element_op);
}

example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp (new file, 0 → 100644, view file @ f20e48f1)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_activ_dynamic_unary_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"

int main(int argc, char* argv[])
{
    ck::tensor_operation::element_wise::SoftRelu out_element_op;
    return !run_convnd_example(argc, argv, out_element_op);
}
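All ten drivers share the same shape: construct the element-wise functor, then delegate to run_convnd_example from ../run_convnd_activ_dynamic_example.inc (not among the 20 files shown on this page), returning 0 on success. The Tanh and Swish variants registered in the dynamic_unary CMakeLists above follow the same pattern but fall outside this page's 20-file view.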