Commit 6368be50 authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'amd-develop' into amd-master

parents 32806d5f 71d6ede7
...@@ -55,3 +55,12 @@ _static/ ...@@ -55,3 +55,12 @@ _static/
_templates/ _templates/
_toc.yml _toc.yml
_doxygen/ _doxygen/
# JetBrains IDE
.idea/
cmake-build*/
build*/
# Python virtualenv
.venv/
...@@ -19,7 +19,7 @@ None ...@@ -19,7 +19,7 @@ None
- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
- Support for Batched Gemm DL (#732) - Support for Batched Gemm DL (#732)
- Introduce wrapper sublibrary (limited functionality). (#1071, #1098) - Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108)
### Changes ### Changes
- Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
......
...@@ -4,22 +4,27 @@ if(POLICY CMP0140) ...@@ -4,22 +4,27 @@ if(POLICY CMP0140)
cmake_policy(SET CMP0140 NEW) cmake_policy(SET CMP0140 NEW)
endif() endif()
get_property(_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
# This has to be initialized before the project() command appears # This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT MSVC_IDE AND NOT CMAKE_BUILD_TYPE ) if(_GENERATOR_IS_MULTI_CONFIG)
set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo;MinSizeRel" CACHE STRING
"Available build types (configurations) on multi-config generators")
else()
set(CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
endif() endif()
# Default installation path # Default installation path
if(WIN32) if(NOT WIN32)
set(CMAKE_INSTALL_PREFIX "/opt/rocm/x86_64-w64-mingw32" CACHE PATH "")
else()
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "") set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
endif() endif()
set(version 1.1.0) set(version 1.1.0)
# Check support for CUDA/HIP in Cmake # Check support for CUDA/HIP in Cmake
project(composable_kernel VERSION ${version}) project(composable_kernel VERSION ${version} LANGUAGES CXX)
include(CTest)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
...@@ -62,6 +67,7 @@ endif() ...@@ -62,6 +67,7 @@ endif()
#for f8/bf8_t type #for f8/bf8_t type
add_compile_options(-Wno-bit-int-extension) add_compile_options(-Wno-bit-int-extension)
add_compile_options(-Wno-pass-failed) add_compile_options(-Wno-pass-failed)
add_compile_options(-Wno-switch-default)
if(DL_KERNELS) if(DL_KERNELS)
add_definitions(-DDL_KERNELS) add_definitions(-DDL_KERNELS)
...@@ -73,15 +79,15 @@ if(INSTANCES_ONLY) ...@@ -73,15 +79,15 @@ if(INSTANCES_ONLY)
set(CK_ENABLE_INSTANCES_ONLY "ON") set(CK_ENABLE_INSTANCES_ONLY "ON")
endif() endif()
include(getopt)
# CK config file to record supported datatypes, etc. # CK config file to record supported datatypes, etc.
configure_file("${PROJECT_SOURCE_DIR}/include/ck/config.h.in" "${PROJECT_BINARY_DIR}/include/ck/config.h") configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
# CK version file to record release version as well as git commit hash # CK version file to record release version as well as git commit hash
find_package(Git REQUIRED) find_package(Git REQUIRED)
execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
configure_file("${PROJECT_SOURCE_DIR}/include/ck/version.h.in" "${PROJECT_BINARY_DIR}/include/ck/version.h") configure_file(include/ck/version.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/version.h)
enable_testing()
set(ROCM_SYMLINK_LIBS OFF) set(ROCM_SYMLINK_LIBS OFF)
find_package(ROCM REQUIRED PATHS /opt/rocm) find_package(ROCM REQUIRED PATHS /opt/rocm)
...@@ -97,7 +103,7 @@ include(TargetFlags) ...@@ -97,7 +103,7 @@ include(TargetFlags)
rocm_setup_version(VERSION ${version}) rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip "$ENV{ROCM_PATH}" "$ENV{HIP_PATH}")
message("GPU_TARGETS= ${GPU_TARGETS}") message("GPU_TARGETS= ${GPU_TARGETS}")
...@@ -142,7 +148,7 @@ find_package(hip) ...@@ -142,7 +148,7 @@ find_package(hip)
# SWDEV-413293 and https://reviews.llvm.org/D155213 # SWDEV-413293 and https://reviews.llvm.org/D155213
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}") math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
message("hip_version_flat=${hip_VERSION_FLAT}") message("hip_version_flat=${hip_VERSION_FLAT}")
if(${hip_VERSION_FLAT} GREATER 500723302) if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
message("Adding the fno-offload-uniform-block compiler flag") message("Adding the fno-offload-uniform-block compiler flag")
add_compile_options(-fno-offload-uniform-block) add_compile_options(-fno-offload-uniform-block)
endif() endif()
...@@ -174,8 +180,8 @@ elseif(CK_PARALLEL_COMPILE_JOBS) ...@@ -174,8 +180,8 @@ elseif(CK_PARALLEL_COMPILE_JOBS)
endif() endif()
option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF) option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
if(USE_BITINT_EXTENSION_INT4) if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
...@@ -195,7 +201,6 @@ find_package(Threads REQUIRED) ...@@ -195,7 +201,6 @@ find_package(Threads REQUIRED)
link_libraries(Threads::Threads) link_libraries(Threads::Threads)
## C++ ## C++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF)
...@@ -466,7 +471,9 @@ if(NOT DEFINED INSTANCES_ONLY) ...@@ -466,7 +471,9 @@ if(NOT DEFINED INSTANCES_ONLY)
PACKAGE_NAME examples PACKAGE_NAME examples
) )
add_subdirectory(example) add_subdirectory(example)
add_subdirectory(test) if(BUILD_TESTING)
add_subdirectory(test)
endif()
rocm_package_setup_component(profiler rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
......
...@@ -16,12 +16,18 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl ...@@ -16,12 +16,18 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate RUN if [ "$ROCMVERSION" != "6.0.1" ]; then \
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb sh -c "wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb && \
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.0.1" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.0-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.0-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.0.1 rel-95 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1704947; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms RUN amdgpu-install -y --usecase=rocm --no-dkms
...@@ -111,7 +117,7 @@ ENV compiler_commit=$compiler_commit ...@@ -111,7 +117,7 @@ ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'" RUN sh -c "echo compiler version = '$compiler_version'"
RUN sh -c "echo compiler commit = '$compiler_commit'" RUN sh -c "echo compiler commit = '$compiler_commit'"
RUN if [ "$compiler_version" != "" ] && [ "$compiler_commit" = "" ]; then \ RUN if ( [ "$compiler_version" = "amd-stg-open" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \ cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
...@@ -119,7 +125,7 @@ RUN if [ "$compiler_version" != "" ] && [ "$compiler_commit" = "" ]; then \ ...@@ -119,7 +125,7 @@ RUN if [ "$compiler_version" != "" ] && [ "$compiler_commit" = "" ]; then \
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
RUN if [ "$compiler_version" != "" ] && [ "$compiler_commit" != "" ]; then \ RUN if ( [ "$compiler_version" = "amd-stg-open" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
......
...@@ -33,7 +33,7 @@ def runShell(String command){ ...@@ -33,7 +33,7 @@ def runShell(String command){
def getDockerImageName(){ def getDockerImageName(){
def img def img
if (params.ROCMVERSION != "6.1"){ if (params.ROCMVERSION != "6.0.1"){
if (params.COMPILER_VERSION == "") { if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
} }
...@@ -84,7 +84,7 @@ def build_compiler(){ ...@@ -84,7 +84,7 @@ def build_compiler(){
compiler = '/opt/rocm/bin/hipcc' compiler = '/opt/rocm/bin/hipcc'
} }
else{ else{
if (params.COMPILER_VERSION != "" || params.COMPILER_COMMIT != ""){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
compiler = "/llvm-project/build/bin/clang++" compiler = "/llvm-project/build/bin/clang++"
} }
else{ else{
...@@ -293,7 +293,7 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -293,7 +293,7 @@ def buildHipClangJob(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "" || params.COMPILER_COMMIT != ""){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -348,7 +348,7 @@ def runCKProfiler(Map conf=[:]){ ...@@ -348,7 +348,7 @@ def runCKProfiler(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "" || params.COMPILER_COMMIT != ""){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -479,7 +479,7 @@ def Build_CK(Map conf=[:]){ ...@@ -479,7 +479,7 @@ def Build_CK(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "" || params.COMPILER_COMMIT != ""){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
add_library(getopt::getopt INTERFACE IMPORTED GLOBAL)
if(WIN32)
include(FetchContent)
FetchContent_Declare(
getopt
GIT_REPOSITORY https://github.com/apwojcik/getopt.git
GIT_TAG main
SYSTEM
)
set(__build_shared_libs ${BUILD_SHARED_LIBS})
set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")
FetchContent_MakeAvailable(getopt)
# Restore the old value of BUILD_SHARED_LIBS
set(BUILD_SHARED_LIBS ${__build_shared_libs} CACHE BOOL "Type of libraries to build" FORCE)
FetchContent_GetProperties(getopt)
target_link_libraries(getopt::getopt INTERFACE wingetopt)
target_include_directories(getopt::getopt INTERFACE ${getopt_SOURCE_DIR}/src)
endif()
\ No newline at end of file
...@@ -6,9 +6,42 @@ if(GOOGLETEST_DIR) ...@@ -6,9 +6,42 @@ if(GOOGLETEST_DIR)
set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override") set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif() endif()
message(STATUS "Fetching GoogleTest") FetchContent_Declare(
GTest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571
)
# Suppress ROCMChecks WARNING on GoogleTests
set(ROCM_DISABLE_CHECKS FALSE)
macro(rocm_check_toolchain_var var access value list_file)
if(NOT ROCM_DISABLE_CHECKS)
_rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
endif()
endmacro()
if(WIN32)
set(gtest_force_shared_crt ON CACHE_INTERNAL "")
endif()
set(BUILD_GMOCK OFF CACHE INTERNAL "")
set(INSTALL_GTEST OFF CACHE INTERNAL "")
# Store the current value of BUILD_SHARED_LIBS
set(__build_shared_libs ${BUILD_SHARED_LIBS})
set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")
list(APPEND GTEST_CMAKE_CXX_FLAGS set(ROCM_DISABLE_CHECKS TRUE)
FetchContent_MakeAvailable(GTest)
set(ROCM_DISABLE_CHECKS FALSE)
# Restore the old value of BUILD_SHARED_LIBS
set(BUILD_SHARED_LIBS ${__build_shared_libs} CACHE BOOL "Type of libraries to build" FORCE)
set(BUILD_GMOCK OFF CACHE INTERNAL "")
set(INSTALL_GTEST OFF CACHE INTERNAL "")
set(GTEST_CXX_FLAGS
-Wno-undef -Wno-undef
-Wno-reserved-identifier -Wno-reserved-identifier
-Wno-global-constructors -Wno-global-constructors
...@@ -22,29 +55,16 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS ...@@ -22,29 +55,16 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-old-style-cast -Wno-old-style-cast
-Wno-deprecated -Wno-deprecated
-Wno-unsafe-buffer-usage -Wno-unsafe-buffer-usage
-Wno-float-equal
) )
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
FetchContent_Declare( if(WIN32)
googletest list(APPEND GTEST_CXX_FLAGS
GIT_REPOSITORY https://github.com/google/googletest.git -Wno-suggest-destructor-override
GIT_TAG b85864c64758dec007208e56af933fc3f52044ee -Wno-suggest-override
) -Wno-nonportable-system-include-path
-Wno-language-extension-token)
# Will be necessary for windows build
# set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_GetProperties(googletest)
if(NOT googletest_POPULATED)
FetchContent_Populate(googletest)
add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
endif() endif()
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) target_compile_options(gtest PRIVATE ${GTEST_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) target_compile_options(gtest_main PRIVATE ${GTEST_CXX_FLAGS})
target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
set_target_properties(gtest PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(gtest_main PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(gmock PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(gmock_main PROPERTIES POSITION_INDEPENDENT_CODE ON)
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
******************* .. _api-reference:
API Reference Guide
******************* ********************************************************************
API reference guide
********************************************************************
=================
Introduction
=================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces This document contains details of the APIs for the Composable Kernel (CK) library and introduces
some of the key design principles that are used to write new classes that extend CK functionality. some of the key design principles that are used to write new classes that extend CK functionality.
...@@ -30,7 +32,7 @@ DeviceMem ...@@ -30,7 +32,7 @@ DeviceMem
Kernels For Flashattention Kernels For Flashattention
--------------------------- ---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
the classes that are used in the CK GPU implementation of Flashattention. the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes** **Gridwise classes**
......
=================== .. meta::
Contributor's Guide :description: Composable Kernel documentation and API reference library
=================== :keywords: composable kernel, CK, ROCm, API, documentation
.. _contributing-to:
********************************************************************
Contributor's guide
********************************************************************
This chapter explains how to get started contributing to the Composable Kernel project and what are This chapter explains the rules for contributing to the Composable Kernel project, and how to contribute.
the contributing rules.
Getting started Getting started
=============== ===============
...@@ -14,23 +19,21 @@ Getting started ...@@ -14,23 +19,21 @@ Getting started
build the library. You can also find some of this information in the build the library. You can also find some of this information in the
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_ `README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_
on the project's GitHub page. on the project's GitHub page.
#. **Additional reading:** We also recommend reading a `blog post #. **Additional reading:** The blog post `AMD Composable Kernel library: efficient fused kernels for AI apps with just a few lines of code <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_ provides a deeper understanding of the CK library and showcases its performance capabilities.
<https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_ <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
from the AMD Community portal. It offers a deeper understanding of the library's objectives and from the AMD Community portal. It offers a deeper understanding of the library's objectives and showcases its performance capabilities.
showcases its performance capabilities.
#. **General information:** For broader information about AMD products, consider exploring the #. **General information:** For broader information about AMD products, consider exploring the
`AMD Developer Central portal <https://www.amd.com/en/developer.html>`_. `AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
How do I contribute How to contribute
=================== ===================
We deeply value contributions from our users. You can make an impact by reporting issues or You can make an impact by reporting issues or proposing code enhancements through pull requests.
proposing code enhancements through pull requests.
Reporting issues Reporting issues
---------------- ----------------
We use `Github issues <https://github.com/ROCmSoftwarePlatform/composable_kernel/issues>`_ Use `Github issues <https://github.com/ROCmSoftwarePlatform/composable_kernel/issues>`_
to track public bugs and enhancement requests. to track public bugs and enhancement requests.
If you encounter an issue with the library, please check if the problem has already been If you encounter an issue with the library, please check if the problem has already been
...@@ -59,7 +62,7 @@ issue. All reported issues must include: ...@@ -59,7 +62,7 @@ issue. All reported issues must include:
* How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue? * How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
Before submitting any issue, ensure you have addressed all relevant questions from the checklist. Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
Creating Pull Requests Creating Pull Requests
---------------------- ----------------------
...@@ -68,7 +71,7 @@ You can submit `Pull Requests (PR) on GitHub ...@@ -68,7 +71,7 @@ You can submit `Pull Requests (PR) on GitHub
<https://github.com/ROCmSoftwarePlatform/composable_kernel/pulls>`_. <https://github.com/ROCmSoftwarePlatform/composable_kernel/pulls>`_.
All contributors are required to develop their changes on a separate branch and then create a All contributors are required to develop their changes on a separate branch and then create a
pull request to merge their changes into the `develop` branch, which is the default pull request to merge their changes into the `develop` branch, which is the default
development branch in the Composable Kernel project. All external contributors must use their own development branch in the Composable Kernel project. All external contributors must use their own
forks of the project to develop their changes. forks of the project to develop their changes.
...@@ -99,4 +102,4 @@ When submitting a Pull Request you should: ...@@ -99,4 +102,4 @@ When submitting a Pull Request you should:
Following the above guidelines ensures a seamless review process and faster assistance from our Following the above guidelines ensures a seamless review process and faster assistance from our
end. end.
Thank you for your commitment to enhancing the Composable Kernel project! We look forward to collaborating with you. Thank you for your commitment to enhancing the Composable Kernel project!
========================== .. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _supported-primitives:
********************************************************************
Supported Primitives Guide Supported Primitives Guide
========================== ********************************************************************
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins
the algorithms implemented in CK.
------------ ------------
Softmax Softmax
------------ ------------
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` you can decompose the
softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as, softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
.. math:: .. math::
...@@ -27,7 +31,7 @@ where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and ...@@ -27,7 +31,7 @@ where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar. :math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
:math:`B_r \times B_c` we can compute the row-wise softmax as follows. :math:`B_r \times B_c` you can compute the row-wise softmax as follows.
For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate, For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,
......
=================== .. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _docker-hub:
********************************************************************
CK Docker Hub CK Docker Hub
=================== ********************************************************************
-------------------------------------
Why do I need this? Why do I need this?
------------------------------------- ===================
To make our lives easier and bring Composable Kernel dependencies together, we recommend using To make things simpler, and bring Composable Kernel and its dependencies together,
docker images that can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel>`_. docker images can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_. Docker images provide a complete image of the OS, the Composable Kernel library, and its dependencies in a single downloadable file.
------------------------------------- Refer to `Docker Overview <https://docs.docker.com/get-started/overview/>`_ for more information on Docker images and containers.
So what is Composable Kernel?
-------------------------------------
Composable Kernel (CK) library aims to provide a programming model for writing performance critical Which image is right for me?
kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, ============================
through general purpose kernel languages, like HIP C++.
To get the CK library:: The image naming includes information related to the docker image.
For example ``ck_ub20.04_rocm6.0`` indicates the following:
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git * ``ck`` - made for running Composable Kernel;
* ``ub20.04`` - based on Ubuntu 20.04;
* ``rocm6.0`` - ROCm platform version 6.0.
Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. Use the ``docker pull`` command to download the file::
docker pull rocm/composable_kernel:ck_ub20.04_rocm6.0
run a docker container:: What is inside the image?
-------------------------
The docker images have everything you need for running CK including:
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
* `CMake <https://cmake.org/getting-started/>`_
* `Compiler <https://github.com/RadeonOpenCompute/llvm-project>`_
* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
Running the docker container
============================
After downloading the docker image, you can start the container using one of a number of commands. Start with the ``docker run`` command as shown below::
docker run \ docker run \
-it \ -it \
...@@ -30,70 +52,50 @@ run a docker container:: ...@@ -30,70 +52,50 @@ run a docker container::
--group-add sudo \ --group-add sudo \
-w /root/workspace \ -w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.6 \ rocm/composable_kernel:ck_ub20.04_rocm6.0 \
/bin/bash /bin/bash
and build the CK:: After starting the bash shell, the docker container current folder is `~/workspace`. The library path is ``~/workspace/composable_kernel``. Navigate to the library to begin the tutorial as explained in :ref:`hello-world`:
mkdir build && cd build .. note::
# Need to specify target ID, example below is for gfx908 and gfx90a
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
and:: If your current folder is different from `${HOME}`, adjust the line ``-v ${HOME}:/root/workspace`` in the ``docker run`` command to fit your folder structure.
make -j examples tests Stop and restart the docker image
=================================
To run all the test cases including tests and examples run:: After finishing the tutorial, or just when you have completed your work session, you can close the docker container, or stop the docker container to restart it at another time. Closing the docker container means that it is still in the active state, and can be resumed from where you left it. Stopping the container closes it, and returns the image to its initial state.
make test Use the ``Ctrl-D`` option to exit the container, while leaving it active, so you can return to the container in its current state to resume the tutorial, or pickup your project where you left off.
We can also run specific examples or tests like:: To restart the active container use the ``docker exec`` command to specify the container name and options as follows::
./bin/example_gemm_xdl_fp16 docker exec -it <container_name> bash
./bin/test_gemm_fp16
For more details visit `CK github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_, Where:
`CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example>`_,
`even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
------------------------------------- * `exec` is the docker command
And what is inside? * `-it` is the interactive option for `exec`
------------------------------------- * `<container_name>` specifies an active container on the system
* `bash` specifies the command to run in the interactive shell
The docker images have everything you need for running CK including: .. note::
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_ You can use the ``docker container ls`` command to list the active containers on the system.
* `CMake <https://cmake.org/>`_
* `Compiler <https://github.com/RadeonOpenCompute/llvm-project>`_
------------------------------------- To start a container from the image, use the ``docker start`` command::
Which image is right for me?
-------------------------------------
Let's take a look at the image naming, for example ``ck_ub20.04_rocm5.6``. The image specs are: docker start <container_name>
* ``ck`` - made for running Composable Kernel; Then use the docker exec command as shown above to start the bash shell.
* ``ub20.04`` - based on Ubuntu 20.04;
* ``rocm5.6`` - ROCm platform version 5.6.
So just pick the right image for your project dependencies and you're all set. Use the ``docker stop`` command to stop the container and restore the image to its initial state::
------------------------------------- docker stop <container_name>
DIY starts here
------------------------------------- Editing the docker image
=======================
If you need to customize a docker image or just can't stop tinkering, feel free to adjust the If you want to customize the docker image, edit the
`Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_ `Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_
for your needs. from the GitHub repository to suit your needs.
-------------------------------------
License
-------------------------------------
CK is released under the MIT `license <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE>`_.
============================ .. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _composable-kernel:
********************************************************************
Composable Kernel User Guide Composable Kernel User Guide
============================ ********************************************************************
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++. This document contains instructions for installing, using, and contributing to the Composable Kernel project. To learn more see :ref:`what-is-ck`.
------------ The CK documentation is structured as follows:
Introduction
------------
This document contains instructions for installing, using, and contributing to Composable Kernel (CK). .. card:: Conceptual
----------- * :ref:`what-is-ck`
Methodology
-----------
Composable Kernel (CK) library aims to provide a programming model for writing performance critical .. card:: Installation
kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc,
through general purpose kernel languages, like HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability: * :ref:`docker-hub`
* A tile-based programming model .. card:: Tutorial
* Algorithm complexity reduction for complex ML operators, using innovative technique we call
"Tensor Coordinate Transformation".
.. image:: data/ck_component.png * :ref:`hello-world`
:alt: CK Components
-------------- .. card:: API reference
Code Structure
--------------
Current CK library are structured into 4 layers: * :ref:`supported-primitives`
* :ref:`api-reference`
* :ref:`wrapper`
* "Templated Tile Operators" layer .. card:: Contributing to CK
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Wrapper for tensor transform operations"
* "Client API" layer
.. image:: data/ck_layer.png * :ref:`contributing-to`
:alt: CK Layers
Documentation Roadmap
^^^^^^^^^^^^^^^^^^^^^
The following is a list of CK documents in the suggested reading order:
.. toctree:: To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/index.md>`_.
:maxdepth: 5
:caption: Contents:
:numbered:
tutorial_hello_world You can find licensing information at the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.md>`_ page.
dockerhub
wrapper
Supported_Primitives_Guide
API_Reference_Guide
Contributors_Guide
```{include} ../LICENSE.md
```
=======
License
=======
.. include:: ../LICENSE
:literal:
# Anywhere {branch} is used, the branch name will be substituted.
# These comments will also be removed.
defaults: defaults:
numbered: False numbered: False
maxdepth: 6
root: index root: index
subtrees: subtrees:
- caption: About - entries:
entries: - file: what-is-ck.rst
- file: license title: What is Composable Kernel?
- file: dockerhub.rst
title: Docker Hub
- file: tutorial_hello_world.rst
title: Hello World Tutorial
- file: Supported_Primitives_Guide.rst
title: Supported Primitives
- file: API_Reference_Guide.rst
title: API Reference
- file: wrapper.rst
title: Wrapper
- file: Contributors_Guide.rst
title: Contributing to CK
- file: license.md
title: License
rocm-docs-core==0.30.2 rocm-docs-core==0.30.3
sphinxcontrib-bibtex==2.6.1 sphinxcontrib-bibtex==2.6.1
...@@ -113,7 +113,7 @@ requests==2.31.0 ...@@ -113,7 +113,7 @@ requests==2.31.0
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==0.30.2 rocm-docs-core==0.30.3
# via -r requirements.in # via -r requirements.in
six==1.16.0 six==1.16.0
# via # via
......
=============== .. meta::
CK Hello world :description: Composable Kernel documentation and API reference library
=============== :keywords: composable kernel, CK, ROCm, API, documentation
------------------------------------- .. _hello-world:
Motivation
-------------------------------------
This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who ********************************************************************
would like to optimize their pipelines and squeeze every performance drop by adding Composable Hello World Tutorial
Kernel (CK) library to their projects. We would like to make the CK library approachable so ********************************************************************
the tutorial is not based on the latest release and doesn't have all the bleeding edge features,
but it will be reproducible now and forever.
During this tutorial we will have an introduction to the CK library, we will build it and run some This tutorial is for engineers dealing with artificial intelligence and machine learning who
examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go would like to optimize pipelines and improve performance using the Composable
in depth and breadth and get familiar with other tools and ways to integrate CK into your project. Kernel (CK) library. This tutorial provides an introduction to the CK library. You will build the library and run some examples using a "Hello World" example.
-------------------------------------
Description Description
------------------------------------- ===========
Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and Modern AI technology solves more and more problems in a variety of fields, but crafting fast and
efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast efficient workflows is still challenging. CK can make the AI workflow fast
and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create and efficient. CK is a collection of optimized AI operator kernels with tools to create
new ones. The library has components required for majority of modern neural networks architectures new kernels. The library has components required for modern neural network architectures
including matrix multiplication, convolution, contraction, reduction, attention modules, variety of including matrix multiplication, convolution, contraction, reduction, attention modules, a variety of activation functions, and fused operators.
activation functions, fused operators and many more.
So how do we (almost) reach the speed of light? CK acceleration abilities are based on: CK library acceleration features are based on:
* Layered structure. * Layered structure
* Tile-based computation model. * Tile-based computation model
* Tensor coordinate transformation. * Tensor coordinate transformation
* Hardware acceleration use. * Hardware acceleration use
* Support of low precision data types including fp16, bf16, int8 and int4. * Support of low precision data types including fp16, bf16, int8 and int4
If you are excited and need more technical details and benchmarking results - read this awesome If you need more technical details and benchmarking results read the following
`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_. `blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
For more details visit our `github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_. To download the library visit the `composable_kernel repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.
-------------------------------------
Hardware targets Hardware targets
------------------------------------- ================
CK library fully supports `gfx908` and `gfx90a` GPU architectures and only some operators are CK library fully supports `gfx908` and `gfx90a` GPU architectures, while only some operators are
supported for `gfx1030`. Let's check the hardware you have at hand and decide on the target supported for `gfx1030` devices. Check your hardware to determine the target GPU architecture.
GPU architecture.
========== ========= ========== =========
GPU Target AMD GPU GPU Target AMD GPU
...@@ -59,47 +51,24 @@ gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6 ...@@ -59,47 +51,24 @@ gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6
There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
you don't have an AMD GPU at hand. you don't have an AMD GPU at hand.
-------------------------------------
Build the library Build the library
------------------------------------- =================
First let's clone the library and rebase to the tested version:: This tutorial is based on the use of docker images as explained in :ref:`docker-hub`. Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point.
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git .. note::
cd composable_kernel/
git checkout tutorial_hello_world
To make our lives easier we prepared
`docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary
dependencies. Pick the right image and create a container. In this tutorial we use
``rocm/composable_kernel:ck_ub20.04_rocm5.6`` image, it is based on Ubuntu 20.04 and
ROCm v5.6.
If your current folder is ``${HOME}``, start the docker container with::
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${HOME}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.6 \
/bin/bash
If your current folder is different from ``${HOME}``, adjust the line ``-v ${HOME}:/root/workspace`` You can also `install ROCm <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/>`_ on your system, clone the `Composable Kernel repository <https://github.com/ROCmSoftwarePlatform/composable_kernel.git>`_ on GitHub, and use that to build and run the examples using the commands described below.
to fit your folder structure.
Inside the docker container current folder is ``~/workspace``, library path is Both the docker container and GitHub repository include the Composable Kernel library. Navigate to the library::
``~/workspace/composable_kernel``, navigate to the library::
cd composable_kernel/ cd composable_kernel/
Create and go to the ``build`` directory:: Create and change to a ``build`` directory::
mkdir build && cd build mkdir build && cd build
In the previous section we talked about target GPU architecture. Once you decide which one is right The previous section discussed supported GPU architecture. Once you decide which hardware targets are needed, run CMake using the ``GPU_TARGETS`` flag::
for you, run CMake using the right ``GPU_TARGETS`` flag::
cmake \ cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
...@@ -109,26 +78,25 @@ for you, run CMake using the right ``GPU_TARGETS`` flag:: ...@@ -109,26 +78,25 @@ for you, run CMake using the right ``GPU_TARGETS`` flag::
-D BUILD_DEV=OFF \ -D BUILD_DEV=OFF \
-D GPU_TARGETS="gfx908;gfx90a;gfx1030" .. -D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
If everything went well the CMake run will end up with:: If everything goes well the CMake command will return::
-- Configuring done -- Configuring done
-- Generating done -- Generating done
-- Build files have been written to: "/root/workspace/composable_kernel/build" -- Build files have been written to: "/root/workspace/composable_kernel/build"
Finally, we can build examples and tests:: Finally, you can build examples and tests::
make -j examples tests make -j examples tests
If everything is smooth, you'll see:: When complete you should see::
Scanning dependencies of target tests Scanning dependencies of target tests
[100%] Built target tests [100%] Built target tests
---------------------------
Run examples and tests Run examples and tests
--------------------------- ======================
Examples are listed as test cases as well, so we can run all examples and tests with:: Examples are listed as test cases as well, so you can run all examples and tests with::
ctest ctest
...@@ -136,38 +104,32 @@ You can check the list of all tests by running:: ...@@ -136,38 +104,32 @@ You can check the list of all tests by running::
ctest -N ctest -N
We can also run them separately, here is a separate example execution:: You can also run examples separately as shown in the following example execution::
./bin/example_gemm_xdl_fp16 1 1 1 ./bin/example_gemm_xdl_fp16 1 1 1
The arguments ``1 1 1`` mean that we want to run this example in the mode: verify results with CPU, The arguments ``1 1 1`` mean that you want to run this example in the mode: verify results with CPU, initialize matrices with integers, and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
initialize matrices with integers and benchmark the kernel execution. You can play around with
these parameters and see how output and execution results change.
If everything goes well and you have a device based on `gfx908` or `gfx90a` architecture you should see If you have a device based on `gfx908` or `gfx90a` architecture, and if the example runs as expected, you should see something like::
something like::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} Perf: 1.08153 ms, 119.136 TFlops, 89.1972 GB/s, DeviceGemm_Xdl_CShuffle<Default, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, 8, 4, 1, 2> LoopScheduler: Interwave, PipelineVersion: v1
Warm up 1 time
Start running 10 times...
Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1
Meanwhile, running it on a `gfx1030` device should result in:: However, running it on a `gfx1030` device should result in the following::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
But don't panic, some of the operators are supported on `gfx1030` architecture, so you can run a Don't worry, some operators are supported on `gfx1030` architecture, so you can run a
separate example like:: separate example like::
./bin/example_gemm_dl_fp16 1 1 1 ./bin/example_gemm_dl_fp16 1 1 1
and it should result in something nice similar to:: and it should return something like::
a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096} a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1} b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
...@@ -182,12 +144,9 @@ and it should result in something nice similar to:: ...@@ -182,12 +144,9 @@ and it should result in something nice similar to::
.. note:: .. note::
There was a new CMake flag ``DL_KERNELS`` added in the latest versions of CK. If you use one of A new CMake flag ``DL_KERNELS`` has been added to the latest versions of CK. If you do not see the above results when running ``example_gemm_dl_fp16``, you might need to add ``-D DL_KERNELS=ON`` to your CMake command to build the operators supported on the `gfx1030` architecture.
the newest versions of the library and do not see the above results when running
``example_gemm_dl_fp16``, it might be necessary to add ``-D DL_KERNELS=ON`` to your CMake command
in order to build the operators supported on the `gfx1030` architecture.
We can also run a separate test:: You can also run a separate test::
ctest -R test_gemm_fp16 ctest -R test_gemm_fp16
...@@ -198,13 +157,9 @@ If everything goes well you should see something like:: ...@@ -198,13 +157,9 @@ If everything goes well you should see something like::
100% tests passed, 0 tests failed out of 1 100% tests passed, 0 tests failed out of 1
-----------
Summary Summary
----------- =======
In this tutorial we took the first look at the Composable Kernel library, built it on your system In this tutorial you took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. In the next tutorial you will run kernels with different configurations to find out the best one for your hardware and task.
and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different
configs to find out the best one for your hardware and task.
P.S.: Don't forget to switch off the cloud instance if you have launched one, you can find better P.S.: If you are running on a cloud instance, don't forget to switch off the cloud instance.
ways to spend your money for sure!
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _what-is-ck:
********************************************************************
What is the Composable Kernel library
********************************************************************
Methodology
===========
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators using an innovative technique called
"Tensor Coordinate Transformation".
.. image:: data/ck_component.png
:alt: CK Components
Code Structure
==============
The CK library is structured into 4 layers:
* "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Client API" layer
It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
.. image:: data/ck_layer.png
:alt: CK Layers
\ No newline at end of file
=============== .. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _wrapper:
********************************************************************
Wrapper Wrapper
=============== ********************************************************************
------------------------------------- -------------------------------------
Description Description
...@@ -11,7 +17,7 @@ Description ...@@ -11,7 +17,7 @@ Description
The wrapper is under development and its functionality is limited. The wrapper is under development and its functionality is limited.
CK provides a lightweight wrapper for more complex operations implemented in The CK library provides a lightweight wrapper for more complex operations implemented in
the library. It allows indexing of nested layouts using a simple interface the library. It allows indexing of nested layouts using a simple interface
(avoiding complex descriptor transformations) and memory access (using Tensor). (avoiding complex descriptor transformations) and memory access (using Tensor).
...@@ -71,3 +77,11 @@ Tensor helpers ...@@ -71,3 +77,11 @@ Tensor helpers
------------------------------------- -------------------------------------
.. doxygenfile:: tensor_utils.hpp .. doxygenfile:: tensor_utils.hpp
.. doxygenfile:: tensor_partition.hpp
-------------------------------------
Operations
-------------------------------------
.. doxygenfile:: copy.hpp
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment