Unverified Commit 8c4897d1 authored by Rostyslav Geyyer, committed by GitHub

Merge branch 'develop' into lwpck-756

parents 9ba9ebec 9e86ebd6
repos:
- repo: local
  hooks:
  - id: clang-format
    name: clang-format
    entry: clang-format-12 -i --style=file
    language: system
    types_or: [c++, inc]
  - id: copyright-year-checker
    name: copyright-year-checker
    entry: script/check_copyright_year.sh
    verbose: false
    language: script
    types: [c++]
@@ -12,6 +12,11 @@ Full documentation for Composable Kernel is not yet available.
- Improved performance of the normalization kernel
### Added
- Added new cmake flag "DL_KERNELS", which must be set to "ON" to build the gemm_dl and batched_gemm_multi_d_dl instances.
- Added new cmake flag "DTYPES", which can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances of selected data types only.
- Added new cmake flag "INSTANCES_ONLY", which builds only the CK library and instances, without the tests, examples, or profiler.
- Added new feature: if GPU_TARGETS is not set on the cmake command line, CK is built for all targets supported by the compiler.
- Added support for MI300A/MI300X.
- Added support for NAVI3x.
- Added user tutorial (#563).
- Added more instances for irregular GEMM sizes (#560).
@@ -20,8 +25,8 @@ Full documentation for Composable Kernel is not yet available.
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541).
- Added Navi grouped ConvBwdWeight support (#505).
- Added MaxPool, AvgPool forward (#815).
- Added MaxPool backward (#750).
### Changed
- Changed ...
cmake_minimum_required(VERSION 3.14)
set(version 1.1.0)
# Check support for CUDA/HIP in CMake
project(composable_kernel VERSION ${version})
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-DCK_ENABLE_INT8)
set(CK_ENABLE_INT8 "ON")
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON")
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
set(CK_ENABLE_FP16 "ON")
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_ALL_DTYPES "ON")
endif()
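# For example, configuring with -D DTYPES="fp16;int8" defines only CK_ENABLE_FP16 and
# CK_ENABLE_INT8, while leaving DTYPES unset enables every data type via CK_ENABLE_ALL_DTYPES.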
if(DL_KERNELS)
add_definitions(-DDL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON")
endif()
if(INSTANCES_ONLY)
add_definitions(-DINSTANCES_ONLY)
set(CK_ENABLE_INSTANCES_ONLY "ON")
endif()
# CK config file to record supported datatypes, etc.
configure_file("${PROJECT_SOURCE_DIR}/include/ck/config.h.in" "${PROJECT_BINARY_DIR}/include/ck/config.h")
# CK version file to record release version as well as git commit hash
find_package(Git REQUIRED)
execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
configure_file("${PROJECT_SOURCE_DIR}/include/ck/version.h.in" "${PROJECT_BINARY_DIR}/include/ck/version.h")
enable_testing()
set(ROCM_SYMLINK_LIBS OFF)
@@ -16,11 +67,57 @@ include(ROCMSetupVersion)
include(ROCMInstallSymlinks)
include(ROCMCreatePackage)
include(CheckCXXCompilerFlag)
include(ROCMCheckTargetIds)
include(TargetFlags)
rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
message("GPU_TARGETS= ${GPU_TARGETS}")
message("checking which targets are supported")
# This is the list of targets to be used if GPU_TARGETS is not set on the command line.
# These targets will be filtered, and only the supported ones will be kept.
# Setting GPU_TARGETS on the command line overrides this list.
if(NOT PROFILER_ONLY)
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
add_definitions(-DPROFILER_ONLY)
if(GPU_TARGETS)
        message(FATAL_ERROR "For PROFILER_ONLY builds, please do not set GPU_TARGETS; set GPU_ARCH to gfx9, gfx10, or gfx11")
endif()
if(GPU_ARCH MATCHES "gfx9")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
elseif(GPU_ARCH MATCHES "gfx10")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
elseif(GPU_ARCH MATCHES "gfx11")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
else()
        message(FATAL_ERROR "For PROFILER_ONLY builds, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
endif()
endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}")
else()
message("Building CK for the following targets: ${AMDGPU_TARGETS}")
endif()
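# Example configure lines (a sketch; pick targets matching your hardware):
#   cmake -D GPU_TARGETS="gfx90a" ..                 # build for one explicit target
#   cmake -D PROFILER_ONLY=ON -D GPU_ARCH=gfx9 ..    # profiler-only build for the supported gfx9 targets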
find_package(hip)
# No assumption that HIP kernels are launched with uniform block size for backward compatibility
# SWDEV-413293 and https://reviews.llvm.org/D155213
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
message("hip_version_flat=${hip_VERSION_FLAT}")
if(${hip_VERSION_FLAT} GREATER 500723302)
message("Adding the fno-offload-uniform-block compiler flag")
add_compile_options(-fno-offload-uniform-block)
endif()
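# Worked example: HIP 5.7.23302 flattens to (5 * 1000 + 7) * 100000 + 23302 = 500723302,
# so only HIP releases newer than 5.7.23302 get the -fno-offload-uniform-block flag.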
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
@@ -238,13 +335,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
# set CK project include directories
include_directories(BEFORE
    ${PROJECT_BINARY_DIR}/include
    ${PROJECT_SOURCE_DIR}/include
    ${PROJECT_SOURCE_DIR}/library/include
    ${HIP_INCLUDE_DIRS}
)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
    add_compile_options(-Werror)
@@ -258,36 +356,80 @@ file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp"
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list})
    set(target_dir)
    IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
        set(cmake_instance)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
#message("fp16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
#message("fp32 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
#message("fp64 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
#message("bf16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
#message("int8 instance found!")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif()
ENDIF()
ENDFOREACH()
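# Each instance subdirectory opts in or out of a data type with a guard in its own
# CMakeLists.txt, matching the literal patterns grepped for above; a hypothetical sketch
# (the target/source names are placeholders, not actual CK files):
#   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
#       add_library(device_example_instance OBJECT device_example_fp16_instance.cpp)
#   endif()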
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
add_subdirectory(library)
if(NOT DEFINED INSTANCES_ONLY)
    if(NOT DEFINED PROFILER_ONLY)
        rocm_package_setup_component(tests
            LIBRARY_NAME composablekernel
            PACKAGE_NAME tests # Prevent -static suffix on package name
        )
        rocm_package_setup_component(examples
            LIBRARY_NAME composablekernel
            PACKAGE_NAME examples
        )
        add_subdirectory(example)
        add_subdirectory(test)
        rocm_package_setup_component(profiler
            LIBRARY_NAME composablekernel
            PACKAGE_NAME ckProfiler
        )
        add_subdirectory(profiler)
    else()
        #When building PROFILER_ONLY, label the package with GPU_ARCH
        rocm_package_setup_component(profiler
            LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler_${GPU_ARCH}
)
add_subdirectory(profiler)
endif()
endif()
#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
    VERSION "${version}"
@@ -295,9 +437,9 @@ write_basic_package_version_file(
)
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
    NO_CHECK_REQUIRED_COMPONENTS_MACRO
)
rocm_install(FILES
@@ -306,6 +448,13 @@ rocm_install(FILES
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
# Install CK version and configuration files
install(FILES
${PROJECT_BINARY_DIR}/include/ck/version.h
${PROJECT_BINARY_DIR}/include/ck/config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
)
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
...
@@ -6,9 +6,11 @@ This is the list of developers and contributors to Composable Kernel library
## Developers
[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2023
[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), [Astha Rai](https://github.com/arai713), [Shi YanXing](https://github.com/Yanxing-Shi), 2022-2023
[Hari Sadasivan](https://github.com/hsadasiv), [Bartlomiej Kocot](https://github.com/bartekxk), [Bartlomiej Wroblewski](https://github.com/bwroblew), 2023
Hanwen Chang, 2019-2021,
...
@@ -26,9 +26,14 @@ RUN if [ "$ROCMVERSION" != "5.7" ]; then \
        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
    elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
        apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
        amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1637781; \
    fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -48,6 +53,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libpthread-stubs0-dev \
    llvm-amdgpu \
    pkg-config \
python \
    python3 \
    python3-dev \
    python3-pip \
@@ -57,12 +63,16 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    nano \
    zlib1g-dev \
    openssh-server \
    clang-format-12 \
    kmod && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
#Install latest version of cmake
RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git
RUN apt purge --auto-remove -y cmake
RUN apt update
RUN apt install -y software-properties-common lsb-release
...
@@ -11,6 +11,20 @@ def show_node_info() {
    """
}
def nthreads() {
def nproc = sh(returnStdout: true, script: 'nproc')
echo "Number of cores: ${nproc}"
def n = nproc.toInteger()
if (n > 32){
n /= 2
}
if (n > 64){
n = 64
}
echo "Number of threads used for building: ${n}"
return n
}
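// A worked example of the thread heuristic above (not CI output): nproc=24 builds with
// 24 threads, nproc=96 is halved to 48, and nproc=160 is halved to 80 and then capped at 64.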
def runShell(String command){
    def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
    def output = readFile(file: "tmp.txt")
@@ -219,7 +233,8 @@ def cmake_build(Map conf=[:]){
    """
    def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
    // reduce parallelism when compiling, clang uses too much memory
    def nt = nthreads()
    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
    def execute_cmd = conf.get("execute_cmd", "")
    def cmd = conf.get("cmd", """
@@ -461,7 +476,7 @@ def Build_CK(Map conf=[:]){
        else{
            echo "GPU is OK"
        }
        if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
            navi_node = 1
        }
    }
@@ -482,7 +497,7 @@ def Build_CK(Map conf=[:]){
        else{
            echo "GPU is OK"
        }
        if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
            navi_node = 1
        }
    }
@@ -493,8 +508,8 @@ def Build_CK(Map conf=[:]){
    {
        cmake_build(conf)
        dir("build"){
            //run tests and examples
            sh 'make -j check'
            if (navi_node == 0 ){
                //we only need the ckProfiler to run the performance tests, so we pack and stash it
                //do not stash profiler on Navi nodes
@@ -597,7 +612,7 @@ def process_results(Map conf=[:]){
}
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=rc1
0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
@@ -674,7 +689,7 @@ pipeline {
                        -o -iname \'*.cpp.in\' \
                        -o -iname \'*.cl\' \
                        | grep -v 'build/' \
                        | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
            }
            steps{
                buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
@@ -695,8 +710,8 @@ pipeline {
            }
            agent{ label rocmnode("gfx908 || gfx90a") }
            environment{
                setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" """
                execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
            }
            steps{
                Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -717,7 +732,7 @@ pipeline {
                Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
            }
        }
        stage("Build CK and run Tests on Navi21")
        {
            when {
                beforeAgent true
@@ -725,7 +740,7 @@ pipeline {
            }
            agent{ label rocmnode("navi21") }
            environment{
                setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """
                execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
            }
@@ -733,6 +748,22 @@ pipeline {
                Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
            }
        }
stage("Build CK and run Tests on Navi32")
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("navi32") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DDTYPES="fp16;fp32;bf16" -DGPU_TARGETS="gfx1101" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDTYPES="fp16;fp32;bf16" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
    }
}
...
@@ -52,6 +52,8 @@ CK is released under the MIT license. [License File](/LICENSE)
```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
Pre-built Docker images are available from this public repository:
https://hub.docker.com/r/rocm/composable_kernel/tags
## Launch docker
@@ -76,12 +78,26 @@ mkdir build && cd build
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the
current compiler.

Additional cmake flags can be used to significantly speed up the build:

INSTANCES_ONLY (OFF by default) must be set to ON to build only the library and instances,
skipping all tests, examples, and the profiler. This is useful for libraries that use CK as a dependency.

DTYPES (not set by default) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances
of the selected data types only. Currently, building the int8 instances takes a long time (a compiler fix is in the works).

DL_KERNELS (OFF by default) must be set to ON to build the gemm_dl and batched_gemm_multi_d_dl
instances. Those instances are only needed on NAVI2x platforms.
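For example, the following configuration (a minimal sketch; adjust the prefix path, compiler, and targets to your setup) combines these flags to build only the fp16 and fp32 instances for a single target:

```bash
cmake \
  -D CMAKE_PREFIX_PATH=/opt/rocm \
  -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
  -D CMAKE_BUILD_TYPE=Release \
  -D GPU_TARGETS="gfx90a" \
  -D DTYPES="fp16;fp32" \
  -D INSTANCES_ONLY=ON \
  ..
```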
### Build examples and tests
```bash
@@ -109,6 +125,24 @@ make install
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
## Contributing
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
```bash
sudo script/install_precommit.sh
```
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
If you need to uninstall hooks from the repository, you can do so by running the following command:
```bash
script/uninstall_precommit.sh
```
If, for any reason, you need to temporarily disable pre-commit hooks, add the `--no-verify` option to the `git commit` command.
## Caveat
### Kernel Timing and Verification
...
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
    add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
    target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable
    add_executable(client_gemm_quantization gemm_quantization.cpp)
    target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
endif()
@@ -32,63 +32,49 @@ struct SimpleDeviceMem
};

template <ck::index_t NumDimSpatial>
std::size_t GetFlops(const std::array<ck::index_t, NumDimSpatial>& output_lengths,
                     const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
{
    constexpr ck::index_t spatial_offset = 3;
    const auto C = filter_lengths[2];
    // 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
    return static_cast<std::size_t>(2) * C *
           std::accumulate(std::begin(output_lengths),
                           std::end(output_lengths),
                           static_cast<std::size_t>(1),
                           std::multiplies<>()) *
           std::accumulate(std::begin(filter_lengths) + spatial_offset,
                           std::end(filter_lengths),
                           static_cast<std::size_t>(1),
                           std::multiplies<>());
}
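// Note: output_lengths is ordered [G, N, K, <spatial...>], so accumulating the whole array
// already contributes the G * N * K factors; spatial_offset skips the leading G, K, C
// entries of filter_lengths so that only the filter spatial extents are multiplied in.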
template <typename InDataType, ck::index_t NumDimSpatial>
std::size_t GetInputByte(const std::array<ck::index_t, NumDimSpatial>& input_lengths)
{
    // sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
    return sizeof(InDataType) * (std::accumulate(std::begin(input_lengths),
                                                 std::end(input_lengths),
                                                 static_cast<std::size_t>(1),
                                                 std::multiplies<>()));
}
template <typename WeiDataType, ck::index_t NumDimSpatial>
std::size_t GetWeightByte(const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
{
    // sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
    return sizeof(WeiDataType) * (std::accumulate(std::begin(filter_lengths),
                                                  std::end(filter_lengths),
                                                  static_cast<std::size_t>(1),
                                                  std::multiplies<>()));
}
template <typename OutDataType, ck::index_t NumDimSpatial>
std::size_t GetOutputByte(const std::array<ck::index_t, NumDimSpatial>& output_lengths)
{
    // sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
    return sizeof(OutDataType) * (std::accumulate(std::begin(output_lengths),
                                                  std::end(output_lengths),
                                                  static_cast<std::size_t>(1),
                                                  std::multiplies<std::size_t>()));
}
@@ -101,13 +87,12 @@ template <ck::index_t NumDimSpatial,
          typename WeiLayout,
          typename OutLayout>
bool run_grouped_conv_bwd_weight(
    const std::array<ck::index_t, NumDimSpatial + 3>& input_lengths,
    const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
    const std::array<ck::index_t, NumDimSpatial + 3>& filter_lengths,
    const std::array<ck::index_t, NumDimSpatial + 3>& weights_strides,
    const std::array<ck::index_t, NumDimSpatial + 3>& output_lengths,
    const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
    const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
    const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
    const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
@@ -115,9 +100,9 @@ bool run_grouped_conv_bwd_weight(
{
    ck::index_t split_k = 2;
    SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths));
    SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths));
    SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths));

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
                                                                              InLayout,
@@ -141,6 +126,10 @@ bool run_grouped_conv_bwd_weight(
    float best_gb_per_sec = 0;
    float best_tflops     = 0;
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NumDimSpatial + 3> b_g_k_c_xs_lengths{};
    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;
@@ -150,13 +139,12 @@ bool run_grouped_conv_bwd_weight(
        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        input_lengths,
                                                        input_strides,
                                                        filter_lengths,
                                                        weights_strides,
                                                        output_lengths,
                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,
@@ -172,12 +160,10 @@ bool run_grouped_conv_bwd_weight(
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
            std::size_t flop      = GetFlops<NumDimSpatial + 3>(output_lengths, filter_lengths);
            std::size_t num_bytes = GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths) +
                                    GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths) +
                                    GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths);
            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -217,13 +203,12 @@ bool run_grouped_conv_bwd_weight(
        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        input_lengths,
                                                        input_strides,
                                                        filter_lengths,
                                                        weights_strides,
                                                        output_lengths,
                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,
...
@@ -22,6 +22,16 @@ static constexpr ck::index_t C = 192;
static constexpr ck::index_t X  = 3;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, 1, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{K * X * C, X* C, 1, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, 1, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};
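// Note: the lengths/strides above are in [G, N, C, <spatial>] order; a channel stride of 1
// and a spatial stride of C describe a channels-last (GNWC) memory layout.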
int main()
{
@@ -31,7 +41,16 @@ int main()
                                      OutDataType,
                                      InLayout,
                                      WeiLayout,
                                      OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
}
@@ -25,6 +25,19 @@ static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Hi * Wi * C, Hi* Wi* C, 1, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Y * X * C, Y* X* C, 1, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Ho * Wo * K, Ho* Wo* K, 1, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
int main()
{
@@ -34,8 +47,16 @@ int main()
                                      OutDataType,
                                      InLayout,
                                      WeiLayout,
                                      OutLayout>(input_lengths,
                                                 input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
}
@@ -28,6 +28,19 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main()
{
@@ -37,17 +50,16 @@ int main()
                                      OutDataType,
                                      InLayout,
                                      WeiLayout,
                                      OutLayout>(input_lengths,
                                                 input_strides,
                                                 filter_lengths,
                                                 weights_strides,
                                                 output_lengths,
                                                 output_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
}
@@ -28,6 +28,19 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main()
{
@@ -37,17 +50,16 @@ int main()
                                      OutDataType,
                                      InLayout,
                                      WeiLayout,
                                      OutLayout>(input_lengths,
                                                 input_strides,
                                                 filter_lengths,
                                                 weights_strides,
                                                 output_lengths,
                                                 output_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
}
@@ -191,6 +191,12 @@ int main(int argc, char* argv[])
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
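            // Some bwd-weight instances need device scratch memory (e.g. for split-K
            // partial results), so the workspace is queried and attached before Run.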
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }
...
@@ -187,6 +187,12 @@ int main(int argc, char* argv[])
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }
...
@@ -16,6 +16,9 @@ using InDataType = ck::half_t;
using OutDataType   = ck::half_t;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank  = 5;
constexpr ck::index_t WindowRank = 3;
#if 0
@@ -44,33 +47,41 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
    ck::index_t N = 2;
    ck::index_t C  = 32;
    ck::index_t Z  = 2;
    ck::index_t Y  = 2;
    ck::index_t X  = 2;
    ck::index_t Di = 30;
    ck::index_t Hi = 30;
    ck::index_t Wi = 30;
    ck::index_t window_stride_d = 2;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t window_dilation_d = 1;
    ck::index_t window_dilation_h = 1;
    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_d  = 1;
    ck::index_t in_left_pad_h  = 1;
    ck::index_t in_left_pad_w  = 1;
    ck::index_t in_right_pad_d = 1;
    ck::index_t in_right_pad_h = 1;
    ck::index_t in_right_pad_w = 1;

    const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
    ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
    ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
    ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
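    // Zs/Ys/Xs are the effective (dilated) window extents, so the usual
    // out = (in + pad_left + pad_right - window) / stride + 1 output-size formula still applies.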
    // Pool API only supports the order of NCDHW
    std::vector<ck::index_t> in_length  = {N, C, Di, Hi, Wi};
    std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
    std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
    std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations{
window_dilation_d, window_dilation_h, window_dilation_w};
    std::vector<ck::index_t> input_left_pads  = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
    std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
@@ -90,6 +101,8 @@ int main(int argc, char* argv[])
                                                              InDataType,
                                                              OutDataType,
                                                              IndexDataType,
InLayout,
OutLayout,
                                                              ReduceOpId,
                                                              OutputIndex>;
@@ -122,6 +135,7 @@ int main(int argc, char* argv[])
                                                        out_tensor_stride,
                                                        out_tensor_stride,
                                                        window_strides,
window_dilations,
                                                        input_left_pads,
                                                        input_right_pads,
                                                        {2, 3, 4});
@@ -181,6 +195,7 @@ int main(int argc, char* argv[])
                                                        out_tensor_stride,
                                                        out_tensor_stride,
                                                        window_strides,
window_dilations,
                                                        input_left_pads,
                                                        input_right_pads,
                                                        {2, 3, 4});
...
@@ -10,14 +10,18 @@
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"

using InDataType    = ck::half_t;
using OutDataType   = ck::half_t;
using IndexDataType = int32_t;
// We use pool3d to implement pool2d in this example
using InLayout  = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
#if 1
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
constexpr bool OutputIndex = true;
@@ -42,31 +46,66 @@ struct SimpleDeviceMem
    void* p_mem_;
};
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
std::vector<ck::index_t>& window_lengths,
std::vector<ck::index_t>& output_lengths,
std::vector<ck::index_t>& input_stride,
std::vector<ck::index_t>& output_stride,
std::vector<ck::index_t>& indices_stride,
std::vector<ck::index_t>& window_strides,
std::vector<ck::index_t>& window_dilations,
std::vector<ck::index_t>& input_left_pads,
std::vector<ck::index_t>& input_right_pads,
std::vector<ck::index_t>& pooling_dims)
{
// NCHW to NCDHW
input_lengths.insert(input_lengths.begin() + 2, 1);
output_lengths.insert(output_lengths.begin() + 2, 1);
input_stride.insert(input_stride.begin() + 2, 0);
output_stride.insert(output_stride.begin() + 2, 0);
indices_stride.insert(indices_stride.begin() + 2, 0);
// YX to ZYX
window_lengths.insert(window_lengths.begin(), 1);
window_strides.insert(window_strides.begin(), 0);
window_dilations.insert(window_dilations.begin(), 0);
input_left_pads.insert(input_left_pads.begin(), 0);
input_right_pads.insert(input_right_pads.begin(), 0);
pooling_dims = {2, 3, 4};
}
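// For example, an NCHW input of {N, C, Hi, Wi} becomes the NCDHW input {N, C, 1, Hi, Wi},
// and the 2d window {Y, X} becomes the 3d window {1, Y, X} with zero stride, dilation,
// and padding in the added depth dimension.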
int main(int argc, char* argv[])
{
    ck::index_t N  = 2;
    ck::index_t C  = 32;
    ck::index_t Y  = 2;
    ck::index_t X  = 2;
    ck::index_t Hi = 30;
    ck::index_t Wi = 30;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t window_dilation_h = 1;
    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_h  = 1;
    ck::index_t in_left_pad_w  = 1;
    ck::index_t in_right_pad_h = 1;
    ck::index_t in_right_pad_w = 1;

    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
    ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
    ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
    // Pool API only supports the order of NCHW
    std::vector<ck::index_t> in_length  = {N, C, Hi, Wi};
    std::vector<ck::index_t> out_length = {N, C, Ho, Wo};
    std::vector<ck::index_t> window_spatial_lengths = {Y, X};
    std::vector<ck::index_t> window_strides = {window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations = {window_dilation_h, window_dilation_w};
    std::vector<ck::index_t> input_left_pads  = {in_left_pad_h, in_left_pad_w};
    std::vector<ck::index_t> input_right_pads = {in_right_pad_h, in_right_pad_w};
std::vector<ck::index_t> pooling_dims = {2, 3};
std::size_t in_tensor_size = N * C * Hi * Wi; std::size_t in_tensor_size = N * C * Hi * Wi;
std::size_t out_tensor_size = N * C * Ho * Wo; std::size_t out_tensor_size = N * C * Ho * Wo;
...@@ -75,6 +114,18 @@ int main(int argc, char* argv[]) ...@@ -75,6 +114,18 @@ int main(int argc, char* argv[])
std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C}; std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C};
std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C}; std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
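// Note: out_tensor_stride is passed twice below because the max-pool index
// tensor shares the output tensor's layout, so its strides are identical.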
TransformPool2dparamToPool3d(in_length,
window_spatial_lengths,
out_length,
in_tensor_stride,
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
@@ -84,6 +135,8 @@ int main(int argc, char* argv[])
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
@@ -116,9 +169,10 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -175,9 +229,10 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
...
add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
using ADataType = F8;
using BDataType = F16;
using CDataType = F16;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t KBatch = 1;
if(argc == 1)
{
// use default case
}
else if(argc == 8)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
K = std::stoi(argv[3]);
StrideA = std::stoi(argv[4]);
StrideB = std::stoi(argv[5]);
StrideC = std::stoi(argv[6]);
KBatch = std::stoi(argv[7]);
}
else
{
printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideC, KBatch\n");
exit(0);
}
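// Example invocation (KBatch of 2 chosen for illustration):
//   ./client_splitK_gemm 3840 4096 4096 4096 4096 4096 2
// splits the K dimension into 2 batches whose partial results are accumulated into C.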
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
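// For example, with the default sizes the row-major A matrix occupies
// (M - 1) * StrideA + K = 3839 * 4096 + 4096 = 15,728,640 elements.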
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<
ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
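// Each returned pointer is a pre-built kernel instance whose template
// parameters (layouts, data types, element-wise ops) match the DeviceOp
// description above.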
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
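// ave_time is in ms, so flop / 1e9 / ms yields TFLOP/s and bytes / 1e6 / ms yields GB/s.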
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best instance (skipped if no instance supported the problem)
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
@@ -2,7 +2,53 @@ cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++17)
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-DCK_ENABLE_INT8)
if(NOT DEFINED CK_ENABLE_INT8)
set(CK_ENABLE_INT8 "ON")
endif()
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
if(NOT DEFINED CK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON")
endif()
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
if(NOT DEFINED CK_ENABLE_FP16)
set(CK_ENABLE_FP16 "ON")
endif()
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
if(NOT DEFINED CK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
if(NOT DEFINED CK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
endif()
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
if(NOT DEFINED CK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
if(NOT DEFINED CK_ENABLE_ALL_DTYPES)
set(CK_ENABLE_ALL_DTYPES "ON")
endif()
endif()
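# Example (assumed typical out-of-tree invocation): `cmake -DDTYPES="fp16;fp32" ..`
# would define only CK_ENABLE_FP16 and CK_ENABLE_FP32 for this client app.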
find_package(composable_kernel COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
...