Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
316d4acc
Commit
316d4acc
authored
Sep 07, 2023
by
Adam Osewski
Browse files
Merge remote-tracking branch 'origin/develop' into aosewski/gemm_tile_loop
parents
9836e0ae
37a8c1f7
Changes
259
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1134 additions
and
65 deletions
+1134
-65
.pre-commit-config.yaml
.pre-commit-config.yaml
+1
-1
CMakeLists.txt
CMakeLists.txt
+70
-43
Dockerfile
Dockerfile
+1
-1
Jenkinsfile
Jenkinsfile
+1
-1
client_example/05_layernorm/layernorm2d.cpp
client_example/05_layernorm/layernorm2d.cpp
+8
-0
client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
+6
-0
client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
+6
-0
client_example/18_groupnorm/groupnorm_swish.cpp
client_example/18_groupnorm/groupnorm_swish.cpp
+8
-0
client_example/19_pool/CMakeLists.txt
client_example/19_pool/CMakeLists.txt
+11
-0
client_example/19_pool/avg_pool3d_bwd.cpp
client_example/19_pool/avg_pool3d_bwd.cpp
+191
-0
client_example/19_pool/avg_pool3d_fwd.cpp
client_example/19_pool/avg_pool3d_fwd.cpp
+0
-0
client_example/19_pool/max_pool2d_bwd.cpp
client_example/19_pool/max_pool2d_bwd.cpp
+280
-0
client_example/19_pool/max_pool2d_fwd.cpp
client_example/19_pool/max_pool2d_fwd.cpp
+0
-0
client_example/20_image_to_column/CMakeLists.txt
client_example/20_image_to_column/CMakeLists.txt
+2
-0
client_example/20_image_to_column/image_to_column.cpp
client_example/20_image_to_column/image_to_column.cpp
+167
-0
client_example/21_grouped_gemm_bias/CMakeLists.txt
client_example/21_grouped_gemm_bias/CMakeLists.txt
+2
-0
client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp
.../21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp
+244
-0
client_example/CMakeLists.txt
client_example/CMakeLists.txt
+37
-16
cmake/DoxygenDoc.cmake
cmake/DoxygenDoc.cmake
+2
-0
docs/Contributors_Guide.rst
docs/Contributors_Guide.rst
+97
-3
No files found.
.pre-commit-config.yaml
View file @
316d4acc
...
@@ -3,7 +3,7 @@ repos:
...
@@ -3,7 +3,7 @@ repos:
hooks
:
hooks
:
-
id
:
clang-format
-
id
:
clang-format
name
:
clang-format
name
:
clang-format
entry
:
clang-format-1
0
-i --style=file
entry
:
clang-format-1
2
-i --style=file
language
:
system
language
:
system
types_or
:
[
c++
,
inc
]
types_or
:
[
c++
,
inc
]
-
id
:
copyright-year-checker
-
id
:
copyright-year-checker
...
...
CMakeLists.txt
View file @
316d4acc
cmake_minimum_required
(
VERSION 3.14
)
cmake_minimum_required
(
VERSION 3.14
)
set
(
version 1.1.0
)
# Check support for CUDA/HIP in Cmake
# Check support for CUDA/HIP in Cmake
project
(
composable_kernel
)
project
(
composable_kernel
VERSION
${
version
}
)
list
(
APPEND CMAKE_MODULE_PATH
"
${
PROJECT_SOURCE_DIR
}
/cmake"
)
list
(
APPEND CMAKE_MODULE_PATH
"
${
PROJECT_SOURCE_DIR
}
/cmake"
)
if
(
DTYPES
)
if
(
DTYPES
)
add_definitions
(
-DDTYPES
)
add_definitions
(
-DDTYPES
)
if
(
DTYPES MATCHES
"int8"
)
if
(
DTYPES MATCHES
"int8"
)
add_definitions
(
-D__int8__
)
add_definitions
(
-DCK_ENABLE_INT8
)
endif
()
set
(
CK_ENABLE_INT8
"ON"
)
if
(
DTYPES MATCHES
"fp8"
)
endif
()
add_definitions
(
-D__fp8__
)
if
(
DTYPES MATCHES
"fp8"
)
endif
()
add_definitions
(
-DCK_ENABLE_FP8
)
if
(
DTYPES MATCHES
"fp16"
)
set
(
CK_ENABLE_FP8
"ON"
)
add_definitions
(
-D__fp16__
)
endif
()
endif
()
if
(
DTYPES MATCHES
"fp16"
)
if
(
DTYPES MATCHES
"fp32"
)
add_definitions
(
-DCK_ENABLE_FP16
)
add_definitions
(
-D__fp32__
)
set
(
CK_ENABLE_FP16
"ON"
)
endif
()
endif
()
if
(
DTYPES MATCHES
"fp64"
)
if
(
DTYPES MATCHES
"fp32"
)
add_definitions
(
-D__fp64__
)
add_definitions
(
-DCK_ENABLE_FP32
)
endif
()
set
(
CK_ENABLE_FP32
"ON"
)
if
(
DTYPES MATCHES
"bf16"
)
endif
()
add_definitions
(
-D__bf16__
)
if
(
DTYPES MATCHES
"fp64"
)
endif
()
add_definitions
(
-DCK_ENABLE_FP64
)
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
set
(
CK_ENABLE_FP64
"ON"
)
endif
()
if
(
DTYPES MATCHES
"bf16"
)
add_definitions
(
-DCK_ENABLE_BF16
)
set
(
CK_ENABLE_BF16
"ON"
)
endif
()
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
else
()
else
()
add_definitions
(
-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__
)
add_definitions
(
-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16
)
set
(
CK_ENABLE_ALL_DTYPES
"ON"
)
endif
()
endif
()
if
(
DL_KERNELS
)
if
(
DL_KERNELS
)
add_definitions
(
-DDL_KERNELS
)
add_definitions
(
-DDL_KERNELS
)
set
(
CK_ENABLE_DL_KERNELS
"ON"
)
endif
()
endif
()
if
(
INSTANCES_ONLY
)
if
(
INSTANCES_ONLY
)
add_definitions
(
-DINSTANCES_ONLY
)
add_definitions
(
-DINSTANCES_ONLY
)
set
(
CK_ENABLE_INSTANCES_ONLY
"ON"
)
endif
()
endif
()
# CK config file to record supported datatypes, etc.
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/include/ck/config.h.in"
"
${
PROJECT_BINARY_DIR
}
/include/ck/config.h"
)
# CK version file to record release version as well as git commit hash
find_package
(
Git REQUIRED
)
execute_process
(
COMMAND
"
${
GIT_EXECUTABLE
}
"
rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/include/ck/version.h.in"
"
${
PROJECT_BINARY_DIR
}
/include/ck/version.h"
)
enable_testing
()
enable_testing
()
set
(
ROCM_SYMLINK_LIBS OFF
)
set
(
ROCM_SYMLINK_LIBS OFF
)
...
@@ -50,8 +68,10 @@ include(ROCMInstallSymlinks)
...
@@ -50,8 +68,10 @@ include(ROCMInstallSymlinks)
include
(
ROCMCreatePackage
)
include
(
ROCMCreatePackage
)
include
(
CheckCXXCompilerFlag
)
include
(
CheckCXXCompilerFlag
)
include
(
ROCMCheckTargetIds
)
include
(
ROCMCheckTargetIds
)
rocm_setup_version
(
VERSION 0.2.0
)
include
(
TargetFlags
)
include
(
TargetFlags
)
rocm_setup_version
(
VERSION
${
version
}
)
list
(
APPEND CMAKE_PREFIX_PATH
${
CMAKE_INSTALL_PREFIX
}
${
CMAKE_INSTALL_PREFIX
}
/llvm
${
CMAKE_INSTALL_PREFIX
}
/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip
)
list
(
APPEND CMAKE_PREFIX_PATH
${
CMAKE_INSTALL_PREFIX
}
${
CMAKE_INSTALL_PREFIX
}
/llvm
${
CMAKE_INSTALL_PREFIX
}
/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip
)
message
(
"GPU_TARGETS=
${
GPU_TARGETS
}
"
)
message
(
"GPU_TARGETS=
${
GPU_TARGETS
}
"
)
...
@@ -315,13 +335,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
...
@@ -315,13 +335,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set
(
CMAKE_ARCHIVE_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/lib
)
set
(
CMAKE_ARCHIVE_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/lib
)
set
(
CMAKE_RUNTIME_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/bin
)
set
(
CMAKE_RUNTIME_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/bin
)
# set CK project include directories
include_directories
(
BEFORE
include_directories
(
BEFORE
${
PROJECT_BINARY_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/library/include
${
PROJECT_SOURCE_DIR
}
/library/include
${
HIP_INCLUDE_DIRS
}
${
HIP_INCLUDE_DIRS
}
)
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
if
(
BUILD_DEV
)
add_compile_options
(
-Werror
)
add_compile_options
(
-Werror
)
...
@@ -341,35 +362,35 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
...
@@ -341,35 +362,35 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
file
(
READ
"
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu/
${
subdir_path
}
/CMakeLists.txt"
cmake_instance
)
file
(
READ
"
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu/
${
subdir_path
}
/CMakeLists.txt"
cmake_instance
)
set
(
add_inst 0
)
set
(
add_inst 0
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp8
\"
"
AND DTYPES MATCHES
"fp8"
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp8
\"
"
AND DTYPES MATCHES
"fp8"
)
#message("fp8 instance found!")
#message("fp8 instance found!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp16
\"
"
AND DTYPES MATCHES
"fp16"
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp16
\"
"
AND DTYPES MATCHES
"fp16"
)
#message("fp16 instance found!")
#message("fp16 instance found!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp32
\"
"
AND DTYPES MATCHES
"fp32"
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp32
\"
"
AND DTYPES MATCHES
"fp32"
)
#message("fp32 instance found!")
#message("fp32 instance found!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp64
\"
"
AND DTYPES MATCHES
"fp64"
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
fp64
\"
"
AND DTYPES MATCHES
"fp64"
)
#message("fp64 instance found!")
#message("fp64 instance found!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
bf16
\"
"
AND DTYPES MATCHES
"bf16"
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
bf16
\"
"
AND DTYPES MATCHES
"bf16"
)
#message("bf16 instance found!")
#message("bf16 instance found!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
int8
\"
"
AND DTYPES MATCHES
"int8"
)
if
(
"
${
cmake_instance
}
"
MATCHES
"DTYPES MATCHES
\"
int8
\"
"
AND DTYPES MATCHES
"int8"
)
#message("int8 instance found!")
#message("int8 instance found!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
NOT
"
${
cmake_instance
}
"
MATCHES
"DTYPES"
)
if
(
NOT
"
${
cmake_instance
}
"
MATCHES
"DTYPES"
)
#message("instance should be built for all types!")
#message("instance should be built for all types!")
set
(
add_inst 1
)
set
(
add_inst 1
)
endif
()
endif
()
if
(
add_inst EQUAL 1 OR NOT DEFINED DTYPES
)
if
(
add_inst EQUAL 1 OR NOT DEFINED DTYPES
)
list
(
APPEND CK_DEVICE_INSTANCES device_
${
subdir_path
}
_instance
)
list
(
APPEND CK_DEVICE_INSTANCES device_
${
subdir_path
}
_instance
)
endif
()
endif
()
ENDIF
()
ENDIF
()
ENDFOREACH
()
ENDFOREACH
()
...
@@ -409,7 +430,6 @@ endif()
...
@@ -409,7 +430,6 @@ endif()
#Create an interface target for the include only files and call it "composablekernels"
#Create an interface target for the include only files and call it "composablekernels"
include
(
CMakePackageConfigHelpers
)
include
(
CMakePackageConfigHelpers
)
set
(
version 1.0.0
)
write_basic_package_version_file
(
write_basic_package_version_file
(
"
${
CMAKE_CURRENT_BINARY_DIR
}
/composable_kernelConfigVersion.cmake"
"
${
CMAKE_CURRENT_BINARY_DIR
}
/composable_kernelConfigVersion.cmake"
VERSION
"
${
version
}
"
VERSION
"
${
version
}
"
...
@@ -417,9 +437,9 @@ write_basic_package_version_file(
...
@@ -417,9 +437,9 @@ write_basic_package_version_file(
)
)
configure_package_config_file
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/Config.cmake.in
configure_package_config_file
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/Config.cmake.in
"
${
CMAKE_CURRENT_BINARY_DIR
}
/composable_kernelConfig.cmake"
"
${
CMAKE_CURRENT_BINARY_DIR
}
/composable_kernelConfig.cmake"
INSTALL_DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
INSTALL_DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)
)
rocm_install
(
FILES
rocm_install
(
FILES
...
@@ -428,6 +448,13 @@ rocm_install(FILES
...
@@ -428,6 +448,13 @@ rocm_install(FILES
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
)
# Install CK version and configuration files
rocm_install
(
FILES
${
PROJECT_BINARY_DIR
}
/include/ck/version.h
${
PROJECT_BINARY_DIR
}
/include/ck/config.h
DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/
)
set
(
CPACK_RESOURCE_FILE_LICENSE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/LICENSE"
)
set
(
CPACK_RESOURCE_FILE_LICENSE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/LICENSE"
)
set
(
CPACK_RPM_PACKAGE_LICENSE
"MIT"
)
set
(
CPACK_RPM_PACKAGE_LICENSE
"MIT"
)
...
...
Dockerfile
View file @
316d4acc
...
@@ -63,7 +63,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
...
@@ -63,7 +63,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
nano
\
nano
\
zlib1g-dev
\
zlib1g-dev
\
openssh-server
\
openssh-server
\
clang-format-1
0
\
clang-format-1
2
\
kmod
&&
\
kmod
&&
\
apt-get clean
&&
\
apt-get clean
&&
\
rm
-rf
/var/lib/apt/lists/
*
rm
-rf
/var/lib/apt/lists/
*
...
...
Jenkinsfile
View file @
316d4acc
...
@@ -689,7 +689,7 @@ pipeline {
...
@@ -689,7 +689,7 @@ pipeline {
-o -iname \'*.cpp.in\' \
-o -iname \'*.cpp.in\' \
-o -iname \'*.cl\' \
-o -iname \'*.cl\' \
| grep -v 'build/' \
| grep -v 'build/' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-1
0
-style=file {} | diff - {}\'"
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-1
2
-style=file {} | diff - {}\'"
}
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_cmd:
""
,
build_cmd:
""
,
execute_cmd:
execute_cmd
,
no_reboot:
true
)
buildHipClangJobAndReboot
(
setup_cmd:
""
,
build_cmd:
""
,
execute_cmd:
execute_cmd
,
no_reboot:
true
)
...
...
client_example/05_layernorm/layernorm2d.cpp
View file @
316d4acc
...
@@ -100,6 +100,10 @@ int main(int argc, char* argv[])
...
@@ -100,6 +100,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
M
*
N
+
sizeof
(
GammaDataType
)
*
N
+
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
M
*
N
+
sizeof
(
GammaDataType
)
*
N
+
...
@@ -153,6 +157,10 @@ int main(int argc, char* argv[])
...
@@ -153,6 +157,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
...
...
client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
View file @
316d4acc
...
@@ -191,6 +191,12 @@ int main(int argc, char* argv[])
...
@@ -191,6 +191,12 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
...
...
client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
View file @
316d4acc
...
@@ -187,6 +187,12 @@ int main(int argc, char* argv[])
...
@@ -187,6 +187,12 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
...
...
client_example/18_groupnorm/groupnorm_swish.cpp
View file @
316d4acc
...
@@ -129,6 +129,10 @@ int main(int argc, char* argv[])
...
@@ -129,6 +129,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_byte
=
std
::
size_t
num_byte
=
...
@@ -184,6 +188,10 @@ int main(int argc, char* argv[])
...
@@ -184,6 +188,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
...
...
client_example/19_pool
_fwd
/CMakeLists.txt
→
client_example/19_pool/CMakeLists.txt
View file @
316d4acc
add_executable
(
client_max_pool2d_fwd max_pool2d_fwd.cpp
)
add_executable
(
client_max_pool2d_fwd max_pool2d_fwd.cpp
)
target_link_libraries
(
client_max_pool2d_fwd PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_max_pool2d_fwd PRIVATE composable_kernel::device_operations
)
add_executable
(
client_max_pool2d_bwd max_pool2d_bwd.cpp
)
target_link_libraries
(
client_max_pool2d_bwd PRIVATE composable_kernel::device_operations
)
add_executable
(
client_avg_pool3d_fwd avg_pool3d_fwd.cpp
)
add_executable
(
client_avg_pool3d_fwd avg_pool3d_fwd.cpp
)
target_link_libraries
(
client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations
)
\ No newline at end of file
add_executable
(
client_avg_pool3d_bwd avg_pool3d_bwd.cpp
)
target_link_libraries
(
client_avg_pool3d_bwd PRIVATE composable_kernel::device_operations
)
client_example/19_pool/avg_pool3d_bwd.cpp
0 → 100644
View file @
316d4acc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
using
DOutDataType
=
ck
::
half_t
;
using
DInDataType
=
ck
::
half_t
;
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{},
mMemSize_
(
mem_size
)
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
void
SetZero
()
const
{
(
void
)
hipMemset
(
p_mem_
,
0
,
mMemSize_
);
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
std
::
size_t
mMemSize_
;
};
int
main
(
int
argc
,
char
*
argv
[])
{
ck
::
index_t
N
=
2
;
ck
::
index_t
C
=
32
;
ck
::
index_t
Z
=
2
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Di
=
30
;
ck
::
index_t
Hi
=
30
;
ck
::
index_t
Wi
=
30
;
ck
::
index_t
window_stride_d
=
2
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
window_dilation_d
=
1
;
ck
::
index_t
window_dilation_h
=
1
;
ck
::
index_t
window_dilation_w
=
1
;
ck
::
index_t
in_left_pad_d
=
1
;
ck
::
index_t
in_left_pad_h
=
1
;
ck
::
index_t
in_left_pad_w
=
1
;
ck
::
index_t
in_right_pad_d
=
1
;
ck
::
index_t
in_right_pad_h
=
1
;
ck
::
index_t
in_right_pad_w
=
1
;
const
ck
::
index_t
Zs
=
(
Z
-
1
)
*
window_dilation_d
+
1
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
ck
::
index_t
Do
=
(
Di
+
in_left_pad_d
+
in_right_pad_d
-
Zs
)
/
window_stride_d
+
1
;
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Ys
)
/
window_stride_h
+
1
;
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
Xs
)
/
window_stride_w
+
1
;
// Pool API only support the order of NCDHW
std
::
vector
<
ck
::
index_t
>
in_length
=
{
N
,
C
,
Di
,
Hi
,
Wi
};
std
::
vector
<
ck
::
index_t
>
out_length
=
{
N
,
C
,
Do
,
Ho
,
Wo
};
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
=
{
Z
,
Y
,
X
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
window_stride_d
,
window_stride_h
,
window_stride_w
};
std
::
vector
<
ck
::
index_t
>
window_dilations
{
window_dilation_d
,
window_dilation_h
,
window_dilation_w
};
std
::
vector
<
ck
::
index_t
>
input_left_pads
=
{
in_left_pad_d
,
in_left_pad_h
,
in_left_pad_w
};
std
::
vector
<
ck
::
index_t
>
input_right_pads
=
{
in_right_pad_d
,
in_right_pad_h
,
in_right_pad_w
};
std
::
size_t
in_tensor_size
=
N
*
C
*
Di
*
Hi
*
Wi
;
std
::
size_t
out_tensor_size
=
N
*
C
*
Do
*
Ho
*
Wo
;
// tensor layout = NDHWC
std
::
vector
<
ck
::
index_t
>
in_tensor_stride
=
{
Di
*
C
*
Hi
*
Wi
,
1
,
C
*
Hi
*
Wi
,
Wi
*
C
,
C
};
std
::
vector
<
ck
::
index_t
>
out_tensor_stride
=
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
};
SimpleDeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
in_tensor_size
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceAvgPoolBwd
<
3
,
DOutDataType
,
DInDataType
,
DOutLayout
,
DInLayout
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_length
,
in_length
,
out_tensor_stride
,
in_tensor_stride
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
din_device_buf
.
SetZero
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
in_tensor_size
*
sizeof
(
DInDataType
)
+
out_tensor_size
*
sizeof
(
DOutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
ave_time
<
best_ave_time
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
// run the best intance
if
(
found
)
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_length
,
in_length
,
out_tensor_stride
,
in_tensor_stride
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
din_device_buf
.
SetZero
();
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
client_example/19_pool
_fwd
/avg_pool3d_fwd.cpp
→
client_example/19_pool/avg_pool3d_fwd.cpp
View file @
316d4acc
File moved
client_example/19_pool/max_pool2d_bwd.cpp
0 → 100644
View file @
316d4acc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
DOutDataType
=
ck
::
half_t
;
using
DInDataType
=
ck
::
half_t
;
using
IndexDataType
=
int32_t
;
// We use pool3d to implement pool2d in this example
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
constexpr
ck
::
index_t
InOutRank
=
5
;
constexpr
ck
::
index_t
WindowRank
=
3
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
void
TransformPool2dparamToPool3d
(
std
::
vector
<
ck
::
index_t
>&
input_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_lengths
,
std
::
vector
<
ck
::
index_t
>&
output_lengths
,
std
::
vector
<
ck
::
index_t
>&
input_stride
,
std
::
vector
<
ck
::
index_t
>&
output_stride
,
std
::
vector
<
ck
::
index_t
>&
indices_stride
,
std
::
vector
<
ck
::
index_t
>&
window_strides
,
std
::
vector
<
ck
::
index_t
>&
window_dilations
,
std
::
vector
<
ck
::
index_t
>&
input_left_pads
,
std
::
vector
<
ck
::
index_t
>&
input_right_pads
,
std
::
vector
<
ck
::
index_t
>&
pooling_dims
)
{
// NCHW to NCDHW
input_lengths
.
insert
(
input_lengths
.
begin
()
+
2
,
1
);
output_lengths
.
insert
(
output_lengths
.
begin
()
+
2
,
1
);
input_stride
.
insert
(
input_stride
.
begin
()
+
2
,
0
);
output_stride
.
insert
(
output_stride
.
begin
()
+
2
,
0
);
indices_stride
.
insert
(
indices_stride
.
begin
()
+
2
,
0
);
// YX to ZYX
window_lengths
.
insert
(
window_lengths
.
begin
(),
1
);
window_strides
.
insert
(
window_strides
.
begin
(),
0
);
window_dilations
.
insert
(
window_dilations
.
begin
(),
0
);
input_left_pads
.
insert
(
input_left_pads
.
begin
(),
0
);
input_right_pads
.
insert
(
input_right_pads
.
begin
(),
0
);
pooling_dims
=
{
2
,
3
,
4
};
}
int
main
(
int
argc
,
char
*
argv
[])
{
ck
::
index_t
N
=
2
;
ck
::
index_t
C
=
32
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Hi
=
30
;
ck
::
index_t
Wi
=
30
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
window_dilation_h
=
1
;
ck
::
index_t
window_dilation_w
=
1
;
ck
::
index_t
in_left_pad_h
=
1
;
ck
::
index_t
in_left_pad_w
=
1
;
ck
::
index_t
in_right_pad_h
=
1
;
ck
::
index_t
in_right_pad_w
=
1
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Ys
)
/
window_stride_h
+
1
;
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
Xs
)
/
window_stride_w
+
1
;
// Pool API only support the order of NCHW
std
::
vector
<
ck
::
index_t
>
in_length
=
{
N
,
C
,
Hi
,
Wi
};
std
::
vector
<
ck
::
index_t
>
out_length
=
{
N
,
C
,
Ho
,
Wo
};
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
=
{
Y
,
X
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
window_stride_h
,
window_stride_w
};
std
::
vector
<
ck
::
index_t
>
window_dilations
=
{
window_dilation_h
,
window_dilation_w
};
std
::
vector
<
ck
::
index_t
>
input_left_pads
=
{
in_left_pad_h
,
in_left_pad_w
};
std
::
vector
<
ck
::
index_t
>
input_right_pads
=
{
in_right_pad_h
,
in_right_pad_w
};
std
::
vector
<
ck
::
index_t
>
pooling_dims
=
{
2
,
3
};
std
::
size_t
in_tensor_size
=
N
*
C
*
Hi
*
Wi
;
std
::
size_t
out_tensor_size
=
N
*
C
*
Ho
*
Wo
;
// tensor layout = NHWC
std
::
vector
<
ck
::
index_t
>
in_tensor_stride
=
{
C
*
Hi
*
Wi
,
1
,
Wi
*
C
,
C
};
std
::
vector
<
ck
::
index_t
>
out_tensor_stride
=
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
};
TransformPool2dparamToPool3d
(
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
pooling_dims
);
SimpleDeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_tensor_size
);
SimpleDeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_tensor_size
);
SimpleDeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
in_tensor_size
);
// Generate index data from max pool forward
{
using
MaxPoolFwdDeviceOp
=
ck
::
tensor_operation
::
device
::
DevicePoolFwd
<
InOutRank
,
WindowRank
,
InDataType
,
OutDataType
,
IndexDataType
,
InLayout
,
OutLayout
,
ck
::
ReduceTensorOp
::
MAX
,
true
>
;
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
MaxPoolFwdDeviceOp
>::
GetInstances
();
auto
&
op_ptr
=
op_ptrs
[
0
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
pooling_dims
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
}
// Run MaxPool bwd
using
MaxPoolBwdDeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
;
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
MaxPoolBwdDeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_tensor_size
,
in_tensor_size
,
window_spatial_lengths
,
window_strides
,
window_dilations
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
in_tensor_size
*
sizeof
(
DInDataType
)
+
out_tensor_size
*
sizeof
(
IndexDataType
)
+
out_tensor_size
*
sizeof
(
DOutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
"GB / s,"
<<
op_name
<<
std
::
endl
;
if
(
ave_time
<
best_ave_time
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
// run the best intance
if
(
found
)
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_tensor_size
,
in_tensor_size
,
window_spatial_lengths
,
window_strides
,
window_dilations
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
client_example/19_pool
_fwd
/max_pool2d_fwd.cpp
→
client_example/19_pool/max_pool2d_fwd.cpp
View file @
316d4acc
File moved
client_example/20_image_to_column/CMakeLists.txt
0 → 100644
View file @
316d4acc
add_executable
(
client_image_to_column image_to_column.cpp
)
target_link_libraries
(
client_image_to_column PRIVATE composable_kernel::device_operations
)
client_example/20_image_to_column/image_to_column.cpp
0 → 100644
View file @
316d4acc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
N
=
32
;
// batch size
static
constexpr
ck
::
index_t
C
=
32
;
// input channel (per group)
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
static
constexpr
ck
::
index_t
X
=
3
;
// filter W
static
constexpr
ck
::
index_t
Hi
=
28
;
// input H
static
constexpr
ck
::
index_t
Wi
=
28
;
// input W
static
constexpr
ck
::
index_t
Ho
=
28
;
// output H
static
constexpr
ck
::
index_t
Wo
=
28
;
// output W
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
()
{
std
::
array
<
ck
::
index_t
,
2
>
in_spatial_lengths
{
Hi
,
Wi
};
std
::
array
<
ck
::
index_t
,
2
>
wei_spatial_lengths
{
Y
,
X
};
std
::
array
<
ck
::
index_t
,
2
>
out_spatial_lengths
{
Ho
,
Wo
};
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
std
::
array
<
ck
::
index_t
,
5
>
in_strides
{
C
,
Hi
*
Wi
*
G
*
C
,
1
,
Wi
*
G
*
C
,
G
*
C
};
std
::
array
<
ck
::
index_t
,
2
>
out_strides
{
Y
*
X
*
C
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_strides
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_dilations
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_left_pads
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_right_pads
{
1
,
1
};
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceImageToColumn
<
NumDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
int
best_op_id
=
-
1
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
N
,
C
,
in_spatial_lengths
,
out_spatial_lengths
,
wei_spatial_lengths
,
in_strides
,
out_strides
,
filter_strides
,
filter_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
G
*
C
+
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
Y
*
X
*
C
;
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_op_id
=
i
;
best_op_name
=
op_name
;
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cerr
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
if
(
best_op_id
<
0
)
{
std
::
cerr
<<
"no suitable instance"
<<
std
::
endl
;
return
EXIT_FAILURE
;
}
std
::
cout
<<
"Best Perf: "
<<
std
::
setw
(
10
)
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
// run the best intance
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
N
,
C
,
in_spatial_lengths
,
out_spatial_lengths
,
wei_spatial_lengths
,
in_strides
,
out_strides
,
filter_strides
,
filter_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
}
client_example/21_grouped_gemm_bias/CMakeLists.txt
0 → 100644
View file @
316d4acc
add_executable
(
client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp
)
target_link_libraries
(
client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_operations
)
client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp
0 → 100644
View file @
316d4acc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_bias.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Add
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
D0DataType
=
F32
;
using
DsDataType
=
ck
::
Tuple
<
D0DataType
>
;
using
EDataType
=
F32
;
using
ALayout
=
Row
;
using
BLayout
=
Row
;
using
D0Layout
=
Row
;
using
DsLayout
=
ck
::
Tuple
<
D0Layout
>
;
using
ELayout
=
Row
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
Add
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
()
{
std
::
vector
<
int
>
Ms
,
Ns
,
Ks
,
StrideAs
,
StrideBs
,
StrideEs
;
int
sum_of_m
=
0
;
Ms
=
{
167
,
183
,
177
,
181
,
153
,
139
,
156
,
173
,
163
,
150
,
204
,
184
,
168
,
156
,
168
,
148
};
int
group_count
=
Ms
.
size
();
for
(
int
i
=
0
;
i
<
group_count
;
++
i
)
{
Ns
.
push_back
(
768
);
Ks
.
push_back
(
4608
);
StrideAs
.
push_back
(
std
::
is_same
<
Row
,
ALayout
>::
value
?
Ks
[
i
]
:
Ms
[
i
]);
StrideBs
.
push_back
(
std
::
is_same
<
Row
,
BLayout
>::
value
?
Ns
[
i
]
:
Ks
[
i
]);
StrideEs
.
push_back
(
std
::
is_same
<
Row
,
ELayout
>::
value
?
Ns
[
i
]
:
Ms
[
i
]);
sum_of_m
+=
Ms
[
i
];
}
auto
f_matrix_space_size
=
[](
std
::
size_t
nRow
,
std
::
size_t
nCol
,
std
::
size_t
stride
,
auto
layout
)
{
using
Layout
=
decltype
(
layout
);
if
constexpr
(
std
::
is_same
<
Layout
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
(
nRow
-
1
)
*
stride
+
nCol
;
}
else
{
return
(
nCol
-
1
)
*
stride
+
nRow
;
}
};
std
::
vector
<
SimpleDeviceMem
>
a_dev_bufs
,
b_dev_bufs
,
d0_dev_bufs
,
e_dev_bufs
;
a_dev_bufs
.
reserve
(
group_count
);
b_dev_bufs
.
reserve
(
group_count
);
d0_dev_bufs
.
reserve
(
group_count
);
e_dev_bufs
.
reserve
(
group_count
);
std
::
vector
<
void
*>
p_e
;
p_e
.
reserve
(
group_count
);
std
::
vector
<
ck
::
tensor_operation
::
device
::
GemmDesc
>
gemm_descs
;
gemm_descs
.
reserve
(
group_count
);
std
::
vector
<
ck
::
tensor_operation
::
device
::
GroupedGemmKernelArgument
<
1
>>
grouped_gemm_kernel_args_
;
grouped_gemm_kernel_args_
.
reserve
(
group_count
);
for
(
int
i
=
0
;
i
<
group_count
;
++
i
)
{
a_dev_bufs
.
emplace_back
(
sizeof
(
ADataType
)
*
f_matrix_space_size
(
Ms
[
i
],
Ks
[
i
],
StrideAs
[
i
],
ALayout
{}));
b_dev_bufs
.
emplace_back
(
sizeof
(
BDataType
)
*
f_matrix_space_size
(
Ks
[
i
],
Ns
[
i
],
StrideBs
[
i
],
BLayout
{}));
d0_dev_bufs
.
emplace_back
(
sizeof
(
D0DataType
)
*
f_matrix_space_size
(
Ms
[
i
],
Ns
[
i
],
0
,
D0Layout
{}));
e_dev_bufs
.
emplace_back
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
Ms
[
i
],
Ns
[
i
],
StrideEs
[
i
],
ELayout
{}));
gemm_descs
.
push_back
({
sum_of_m
,
Ns
[
i
],
Ks
[
i
],
1
,
StrideBs
[
i
],
1
,
{
0
}});
p_e
.
push_back
(
e_dev_bufs
[
i
].
GetDeviceBuffer
());
grouped_gemm_kernel_args_
.
push_back
(
{
a_dev_bufs
[
i
].
GetDeviceBuffer
(),
b_dev_bufs
[
i
].
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d0_dev_bufs
[
i
].
GetDeviceBuffer
()},
e_dev_bufs
[
i
].
GetDeviceBuffer
(),
Ms
[
i
],
Ns
[
i
],
Ks
[
i
],
StrideAs
[
i
],
StrideBs
[
i
],
std
::
array
<
ck
::
index_t
,
1
>
{
0
},
StrideEs
[
i
]});
}
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGroupedGemmFixedNK
<
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
ADataType
,
BDataType
,
DsDataType
,
EDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
std
::
vector
<
const
void
*>
p_a
=
{},
p_b
=
{};
std
::
vector
<
std
::
array
<
const
void
*
,
1
>>
p_ds
=
{};
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
p_a
,
p_b
,
p_ds
,
p_e
,
gemm_descs
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
SimpleDeviceMem
grouped_gemm_kernel_args_dev
(
op_ptr
->
GetDeviceKernelArgSize
(
argument_ptr
.
get
()));
SimpleDeviceMem
grouped_gemm_workspace_dev
(
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
()));
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
hipGetErrorString
(
hipMemcpy
(
grouped_gemm_kernel_args_dev
.
GetDeviceBuffer
(),
grouped_gemm_kernel_args_
.
data
(),
op_ptr
->
GetDeviceKernelArgSize
(
argument_ptr
.
get
()),
hipMemcpyHostToDevice
));
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
grouped_gemm_workspace_dev
.
GetDeviceBuffer
());
op_ptr
->
SetDeviceKernelArgs
(
argument_ptr
.
get
(),
grouped_gemm_kernel_args_dev
.
GetDeviceBuffer
());
op_ptr
->
SetKBatch
(
argument_ptr
.
get
(),
2
);
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
flop
=
0
,
num_btype
=
0
;
for
(
std
::
size_t
j
=
0
;
j
<
gemm_descs
.
size
();
++
j
)
{
flop
+=
std
::
size_t
(
2
)
*
Ms
[
j
]
*
Ns
[
j
]
*
Ks
[
j
];
num_btype
+=
sizeof
(
ADataType
)
*
Ms
[
j
]
*
Ks
[
j
]
+
sizeof
(
BDataType
)
*
Ks
[
j
]
*
Ns
[
j
]
+
sizeof
(
EDataType
)
*
Ms
[
j
]
*
Ns
[
j
];
}
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
0
;
}
client_example/CMakeLists.txt
View file @
316d4acc
...
@@ -3,31 +3,52 @@ project(ck_app)
...
@@ -3,31 +3,52 @@ project(ck_app)
add_compile_options
(
-std=c++17
)
add_compile_options
(
-std=c++17
)
if
(
DTYPES
)
if
(
DTYPES
)
add_definitions
(
-DDTYPES
)
add_definitions
(
-DDTYPES
)
if
(
DTYPES MATCHES
"int8"
)
if
(
DTYPES MATCHES
"int8"
)
add_definitions
(
-D__int8__
)
add_definitions
(
-DCK_ENABLE_INT8
)
if
(
NOT DEFINED
${
CK_ENABLE_INT8
}
)
set
(
CK_ENABLE_INT8
"ON"
)
endif
()
endif
()
if
(
DTYPES MATCHES
"fp8"
)
endif
()
add_definitions
(
-D__fp8__
)
if
(
DTYPES MATCHES
"fp8"
)
add_definitions
(
-DCK_ENABLE_FP8
)
if
(
NOT DEFINED
${
CK_ENABLE_FP8
}
)
set
(
CK_ENABLE_FP8
"ON"
)
endif
()
endif
()
if
(
DTYPES MATCHES
"fp16"
)
endif
()
add_definitions
(
-D__fp16__
)
if
(
DTYPES MATCHES
"fp16"
)
add_definitions
(
-DCK_ENABLE_FP16
)
if
(
NOT DEFINED
${
CK_ENABLE_FP16
}
)
set
(
CK_ENABLE_FP16
"ON"
)
endif
()
endif
()
if
(
DTYPES MATCHES
"fp32"
)
endif
()
add_definitions
(
-D__fp32__
)
if
(
DTYPES MATCHES
"fp32"
)
add_definitions
(
-DCK_ENABLE_FP32
)
if
(
NOT DEFINED
${
CK_ENABLE_FP32
}
)
set
(
CK_ENABLE_FP32
"ON"
)
endif
()
endif
()
if
(
DTYPES MATCHES
"fp64"
)
endif
()
add_definitions
(
-D__fp64__
)
if
(
DTYPES MATCHES
"fp64"
)
add_definitions
(
-DCK_ENABLE_FP64
)
if
(
NOT DEFINED
${
CK_ENABLE_FP64
}
)
set
(
CK_ENABLE_FP64
"ON"
)
endif
()
endif
()
if
(
DTYPES MATCHES
"bf16"
)
endif
()
add_definitions
(
-D__bf16__
)
if
(
DTYPES MATCHES
"bf16"
)
add_definitions
(
-DCK_ENABLE_BF16
)
if
(
NOT DEFINED
${
CK_ENABLE_BF16
}
)
set
(
CK_ENABLE_BF16
"ON"
)
endif
()
endif
()
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
endif
()
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
else
()
else
()
add_definitions
(
-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__
)
add_definitions
(
-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16
)
if
(
NOT DEFINED
${
CK_ENABLE_ALL_DTYPES
}
)
set
(
CK_ENABLE_ALL_DTYPES
"ON"
)
endif
()
endif
()
endif
()
find_package
(
composable_kernel
1.0.0
COMPONENTS device_operations
)
find_package
(
composable_kernel COMPONENTS device_operations
)
find_package
(
hip REQUIRED PATHS /opt/rocm
)
find_package
(
hip REQUIRED PATHS /opt/rocm
)
message
(
STATUS
"Build with HIP
${
hip_VERSION
}
"
)
message
(
STATUS
"Build with HIP
${
hip_VERSION
}
"
)
...
...
cmake/DoxygenDoc.cmake
View file @
316d4acc
...
@@ -309,6 +309,8 @@ XML_OUTPUT
...
@@ -309,6 +309,8 @@ XML_OUTPUT
XML_PROGRAMLISTING
XML_PROGRAMLISTING
)
)
set
(
WARN_AS_ERROR YES
)
set
(
DOXYGEN_CONFIG_FILE
"
${
CMAKE_CURRENT_BINARY_DIR
}
/doxygen/doxygen.conf"
CACHE PATH
"Path to generated doxygen configuration file"
)
set
(
DOXYGEN_CONFIG_FILE
"
${
CMAKE_CURRENT_BINARY_DIR
}
/doxygen/doxygen.conf"
CACHE PATH
"Path to generated doxygen configuration file"
)
function
(
add_doxygen_doc
)
function
(
add_doxygen_doc
)
...
...
docs/Contributors_Guide.rst
View file @
316d4acc
...
@@ -2,7 +2,101 @@
...
@@ -2,7 +2,101 @@
Contributor's Guide
Contributor's Guide
===================
===================
Pull-request guidelines
This chapter explains how to get started contributing to the Composable Kernel project and what are
=======================
the contributing rules.
[TODO]
Getting started
===============
#. **Documentation:** Before contributing to the library, familiarize yourself with the
`Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
It provides insight into the core concepts, environment configuration, and steps to obtain or
build the library. You can also find some of this information in the
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_
on the project's GitHub page.
#. **Additional reading:** We also recommend reading a `blog post
<https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
from the AMD Community portal. It offers a deeper understanding of the library's objectives and
showcases its performance capabilities.
#. **General information:** For broader information about AMD products, consider exploring the
`AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
How do I contribute
===================
We deeply value contributions from our users. You can make an impact by reporting issues or
proposing code enhancements through pull requests.
Reporting issues
----------------
We use `Github issues <https://github.com/ROCmSoftwarePlatform/composable_kernel/issues>`_
to track public bugs and enhancement requests.
If you encounter an issue with the library, please check if the problem has already been
reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
issue. All reported issues must include:
* A comprehensive description of the problem, including:
* What did you observe?
* Why do you think it is a bug (if it seems like one)?
* What did you expect to happen? What would indicate the resolution of the problem?
* Are there any known workarounds?
* Your configuration details, including:
* Which GPU are you using?
* Which OS version are you on?
* Which ROCm version are you using?
* Are you using a Docker image? If so, which one?
* Steps to reproduce the issue, including:
* What actions trigger the issue? What are the reproduction steps?
* If you build the library from scratch, what CMake command did you use?
* How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
Before sumbitting any issue, ensure you have addressed all relevant questions from the checklist.
Creating Pull Requests
----------------------
You can submit `Pull Requests (PR) on GitHub
<https://github.com/ROCmSoftwarePlatform/composable_kernel/pulls>`_.
All contributors are required to develop their changes on a separate branch and then create a
pull requrest to merge their changes into the `develop` branch, which is the default
development branch in the Composable Kernel project. All external contributors must use their own
forks of the project to develop their changes.
When submitting a Pull Request you should:
* Describe the change providing information about the motivation for the change and a general
description of all code modifications.
* Verify and test the change:
* Run any relevant existing tests.
* Write new tests if added functionality is not covered by current tests.
* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
highly recommend contributors utilize this method to maintain consistent code formatting.
Instructions on setting up `pre-commit` can be found in the project's
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_
* Link your PR to any related issues:
* If there is an issue that is resolved by your change, please provide a link to the issue in
the description of your pull request.
* For larger contributions, structure your change into a sequence of smaller, focused commits, each
addressing a particular aspect or fix.
Following the above guidelines ensures a seamless review process and faster assistance from our
end.
Thank you for your commitment to enhancing the Composable Kernel project! We look forward to collaborating with you.
Prev
1
2
3
4
5
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment