Commit 3c4fb1dd authored by Umang Yadav's avatar Umang Yadav
Browse files

Merge remote-tracking branch 'origin/develop' into migx_merge

parents 57cdd70b e8cddfdc
# Documentation files
docs/* @saadrahim @LisaDelaney
*.md @saadrahim @LisaDelaney
*.rst @saadrahim @LisaDelaney
# Header directory
library/include/* @saadrahim @LisaDelaney
# Change Log for Composable Kernel # Changelog for Composable Kernel
Full documentation for Composable Kernel is not yet available. Full documentation for Composable Kernel is not yet available.
## CK 0.2.0 for ROCm 5.5.0 ## (Unreleased) CK for ROCm 6.0.0
### Fixed ### Fixes
- Fixed a bug in 6-dimensional kernels (#555). - Fixed a hazard associated with inline v_dot (#808)
- Fixed grouped ConvBwdWeight test case failure (#524). - Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
### Optimizations ### Optimizations
- Improve proformance of normalization kernel None
### Added ### Additions
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances. - Added an image to a column kernel (#867)
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types. - Added a column to an image kernel (#930)
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler. - Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler. - Grouped convolution support for small K and C (#822 #879 #897)
- Added support on MI300A/MI300X. - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Added support on NAVI3x. - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
- Added user tutorial (#563). - Support for Batched Gemm DL (#732)
- Added more instances for irregular GEMM sizes (#560).
- Added inter-wave consumer-producer programming model for GEMM kernels (#310). ### Changes
- Added multi-D GEMM client APIs (#534). - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541). ## CK 0.2.0 for ROCm 5.7.0
- Added Navi grouped ConvBwdWeight support (#505).
- Added MaxPool, AvgPool forward (#815). ### Fixes
- Added MaxPool backward (#750). - Fixed a bug in 6-dimensional kernels (#555)
- Fixed a test case failure with grouped convolution backward weight (#524)
### Changed
- Changed ... ### Optimizations
- Improved the performance of the normalization kernel
### Additions
- New CMake flags:
- "DL_KERNELS"-- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
- "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
- "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
- New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
- Support for MI300A/MI300X
- Support for AMD RDNA 3
- New user tutorial (#563)
- Additional instances for irregular GEMM sizes (#560)
- New inter-wave consumer-producer programming model for GEMM kernels (#310)
- GEMM with support multiple elementwise fusions (multi-D) (#534)
- Multi-embeddings support (#542)
- AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
- AMD RDNA grouped convolution backward weight support (#505)
- MaxPool and AvgPool forward (#815); MaxPool backward (#750)
### Changes
None
cmake_minimum_required(VERSION 3.14) cmake_minimum_required(VERSION 3.14)
if(POLICY CMP0140)
# policies CMP0140 not known to CMake until 3.25
cmake_policy(SET CMP0140 NEW)
endif()
# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT MSVC_IDE AND NOT CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
# Default installation path
if(WIN32)
set(CMAKE_INSTALL_PREFIX "/opt/rocm/x86_64-w64-mingw32" CACHE PATH "")
else()
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
endif()
set(version 1.1.0) set(version 1.1.0)
# Check support for CUDA/HIP in Cmake # Check support for CUDA/HIP in Cmake
...@@ -16,6 +33,10 @@ if (DTYPES) ...@@ -16,6 +33,10 @@ if (DTYPES)
add_definitions(-DCK_ENABLE_FP8) add_definitions(-DCK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON") set(CK_ENABLE_FP8 "ON")
endif() endif()
if (DTYPES MATCHES "bf8")
add_definitions(-DCK_ENABLE_BF8)
set(CK_ENABLE_BF8 "ON")
endif()
if (DTYPES MATCHES "fp16") if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16) add_definitions(-DCK_ENABLE_FP16)
set(CK_ENABLE_FP16 "ON") set(CK_ENABLE_FP16 "ON")
...@@ -34,10 +55,13 @@ if (DTYPES) ...@@ -34,10 +55,13 @@ if (DTYPES)
endif() endif()
message("DTYPES macro set to ${DTYPES}") message("DTYPES macro set to ${DTYPES}")
else() else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16) add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_ALL_DTYPES "ON") set(CK_ENABLE_ALL_DTYPES "ON")
endif() endif()
#for f8/bf8_t type
add_compile_options(-Wno-bit-int-extension)
if(DL_KERNELS) if(DL_KERNELS)
add_definitions(-DDL_KERNELS) add_definitions(-DDL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON") set(CK_ENABLE_DL_KERNELS "ON")
...@@ -82,26 +106,30 @@ message("checking which targets are supported") ...@@ -82,26 +106,30 @@ message("checking which targets are supported")
#Setting GPU_TARGETS on command line will override this list #Setting GPU_TARGETS on command line will override this list
if(NOT PROFILER_ONLY) if(NOT PROFILER_ONLY)
rocm_check_target_ids(DEFAULT_GPU_TARGETS rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else() else()
add_definitions(-DPROFILER_ONLY) add_definitions(-DPROFILER_ONLY)
set(GPU_TARGETS "" CACHE STRING "" FORCE)
if(GPU_TARGETS) if(GPU_TARGETS)
message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11") message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx90, gfx94, gfx10, or gfx11")
endif() endif()
if(GPU_ARCH MATCHES "gfx9") if(GPU_ARCH MATCHES "gfx90")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942") rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx908;gfx90a")
elseif(GPU_ARCH MATCHES "gfx94")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx940;gfx941;gfx942")
elseif(GPU_ARCH MATCHES "gfx10") elseif(GPU_ARCH MATCHES "gfx10")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030") rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
elseif(GPU_ARCH MATCHES "gfx11") elseif(GPU_ARCH MATCHES "gfx11")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102") rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
else() else()
message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11") message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx90, gfx94, gfx10, or gfx11")
endif() endif()
set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
endif() endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}") message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ") set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
if(GPU_TARGETS) if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}") message("Building CK for the following targets: ${GPU_TARGETS}")
...@@ -368,16 +396,18 @@ include_directories(BEFORE ...@@ -368,16 +396,18 @@ include_directories(BEFORE
${HIP_INCLUDE_DIRS} ${HIP_INCLUDE_DIRS}
) )
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
if (NOT CK_BUILD_JIT_LIB) if (NOT CK_BUILD_JIT_LIB)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV) if(BUILD_DEV)
add_compile_options(-Werror) add_compile_options(-Werror -Weverything)
add_compile_options(-Weverything)
endif() endif()
#add flags to reduce the size of binaries
add_compile_options(-Oz -flto=thin)
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp") file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES) set(CK_DEVICE_INSTANCES)
...@@ -387,32 +417,28 @@ if (NOT CK_BUILD_JIT_LIB) ...@@ -387,32 +417,28 @@ if (NOT CK_BUILD_JIT_LIB)
set(cmake_instance) set(cmake_instance)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance) file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0) set(add_inst 0)
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8") if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16") if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
#message("fp16 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32") if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
#message("fp32 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64") if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
#message("fp64 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16") if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
#message("bf16 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8") if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
#message("int8 instance found!") set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES") if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES) if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
...@@ -421,41 +447,41 @@ if (NOT CK_BUILD_JIT_LIB) ...@@ -421,41 +447,41 @@ if (NOT CK_BUILD_JIT_LIB)
ENDIF() ENDIF()
ENDFOREACH() ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
add_subdirectory(library) add_subdirectory(library)
if(NOT DEFINED INSTANCES_ONLY) if(NOT DEFINED INSTANCES_ONLY)
if(NOT DEFINED PROFILER_ONLY) if(NOT DEFINED PROFILER_ONLY)
rocm_package_setup_component(tests rocm_package_setup_component(tests
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name PACKAGE_NAME tests # Prevent -static suffix on package name
) )
rocm_package_setup_component(examples rocm_package_setup_component(examples
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME examples PACKAGE_NAME examples
) )
add_subdirectory(example) add_subdirectory(example)
add_subdirectory(test) add_subdirectory(test)
rocm_package_setup_component(profiler rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler PACKAGE_NAME ckprofiler
) )
add_subdirectory(profiler) add_subdirectory(profiler)
else() else()
#When building PROFILER_ONLY, label the package with GPU_ARCH #When building PROFILER_ONLY, label the package with GPU_ARCH
rocm_package_setup_component(profiler rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler_${GPU_ARCH} PACKAGE_NAME ckprofiler_${GPU_ARCH}
) )
add_subdirectory(profiler) add_subdirectory(profiler)
endif() endif()
endif() endif()
else() else()
rocm_package_setup_component(jit_library rocm_package_setup_component(jit_library
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME jit_library PACKAGE_NAME jit_library
) )
add_subdirectory(library) add_subdirectory(library)
add_subdirectory(test) add_subdirectory(test)
...@@ -483,7 +509,7 @@ rocm_install(FILES ...@@ -483,7 +509,7 @@ rocm_install(FILES
) )
# Install CK version and configuration files # Install CK version and configuration files
install(FILES rocm_install(FILES
${PROJECT_BINARY_DIR}/include/ck/version.h ${PROJECT_BINARY_DIR}/include/ck/version.h
${PROJECT_BINARY_DIR}/include/ck/config.h ${PROJECT_BINARY_DIR}/include/ck/config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
......
FROM ubuntu:20.04 FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=5.6 ARG ROCMVERSION=5.7
ARG compiler_version="" ARG compiler_version=""
ARG compiler_commit="" ARG compiler_commit=""
...@@ -16,52 +16,52 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl ...@@ -16,52 +16,52 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb --no-check-certificate RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate
RUN apt-get update && \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
./amdgpu-install_5.6.50600-1_all.deb RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
RUN if [ "$ROCMVERSION" != "5.7" ]; then \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1637781; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms RUN amdgpu-install -y --usecase=rocm --no-dkms
## Sccache binary built from source for ROCm
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
# Install dependencies # Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
build-essential \ build-essential \
ccache \
cmake \ cmake \
ccache \
git \ git \
hip-rocclr \ hip-rocclr \
iputils-ping \
jq \ jq \
libelf-dev \ libelf-dev \
libncurses5-dev \ libncurses5-dev \
libnuma-dev \ libnuma-dev \
libpthread-stubs0-dev \ libpthread-stubs0-dev \
llvm-amdgpu \ llvm-amdgpu \
net-tools \
pkg-config \ pkg-config \
python \ python \
python3 \ python3 \
python3-dev \ python3-dev \
python3-pip \ python3-pip \
redis \
sshpass \ sshpass \
stunnel \
software-properties-common \ software-properties-common \
vim \ vim \
nano \ nano \
zlib1g-dev \ zlib1g-dev \
zip \
openssh-server \ openssh-server \
clang-format-12 \ clang-format-12 \
kmod && \ kmod && \
...@@ -73,15 +73,8 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas ...@@ -73,15 +73,8 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas
RUN gunzip /usr/local/bin/ninja.gz RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git RUN git clone https://github.com/nico/ninjatracing.git
RUN apt purge --auto-remove -y cmake # Update the cmake to the latest version
RUN apt update RUN pip install --upgrade cmake==3.27.5
RUN apt install -y software-properties-common lsb-release
RUN apt clean all
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
RUN apt install -y kitware-archive-keyring
RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
RUN apt install -y cmake
# Setup ubsan environment to printstacktrace # Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
...@@ -96,9 +89,9 @@ ARG PREFIX=/opt/rocm ...@@ -96,9 +89,9 @@ ARG PREFIX=/opt/rocm
RUN pip3 install --upgrade pip RUN pip3 install --upgrade pip
RUN pip3 install sqlalchemy==1.4.46 RUN pip3 install sqlalchemy==1.4.46
RUN pip3 install pymysql RUN pip3 install pymysql
RUN pip3 install pandas RUN pip3 install pandas==2.0.3
RUN pip3 install setuptools-rust RUN pip3 install setuptools-rust
RUN pip3 install sshtunnel RUN pip3 install sshtunnel==0.4.0
# Setup ubsan environment to printstacktrace # Setup ubsan environment to printstacktrace
ENV UBSAN_OPTIONS=print_stacktrace=1 ENV UBSAN_OPTIONS=print_stacktrace=1
...@@ -121,7 +114,7 @@ RUN sh -c "echo compiler commit = '$compiler_commit'" ...@@ -121,7 +114,7 @@ RUN sh -c "echo compiler commit = '$compiler_commit'"
RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \ cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 8 ; \ make -j 8 ; \
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
...@@ -129,11 +122,13 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; ...@@ -129,11 +122,13 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ];
RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 8 ; \ make -j 8 ; \
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
#clean-up the deb package
RUN sh -c "rm -rf amdgpu-install*"
#ENV HIP_CLANG_PATH='/llvm-project/build/bin' #ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" #RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
...@@ -33,7 +33,7 @@ def runShell(String command){ ...@@ -33,7 +33,7 @@ def runShell(String command){
def getDockerImageName(){ def getDockerImageName(){
def img def img
if (params.ROCMVERSION != "5.7"){ if (params.ROCMVERSION != "6.0"){
if (params.COMPILER_VERSION == "") { if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
} }
...@@ -65,10 +65,10 @@ def getDockerImageName(){ ...@@ -65,10 +65,10 @@ def getDockerImageName(){
} }
def check_host() { def check_host() {
if ("${env.CK_CCACHE}" != "null"){ if ("${env.CK_SCCACHE}" != "null"){
def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}" def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
echo "ccache server: ${CCACHE_SERVER}" echo "sccache server: ${SCCACHE_SERVER}"
sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt''' sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
def output = readFile(file: "tmp.txt") def output = readFile(file: "tmp.txt")
echo "tmp.txt contents: \$output" echo "tmp.txt contents: \$output"
return (output != "0") return (output != "0")
...@@ -96,24 +96,9 @@ def build_compiler(){ ...@@ -96,24 +96,9 @@ def build_compiler(){
def getDockerImage(Map conf=[:]){ def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1 env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm def prefixpath = conf.get("prefixpath", "/opt/rocm")
def no_cache = conf.get("no_cache", false) def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{
if(check_host())
{
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
}
else
{
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
}
if(no_cache) if(no_cache)
{ {
dockerArgs = dockerArgs + " --no-cache " dockerArgs = dockerArgs + " --no-cache "
...@@ -142,21 +127,6 @@ def buildDocker(install_prefix){ ...@@ -142,21 +127,6 @@ def buildDocker(install_prefix){
def image_name = getDockerImageName() def image_name = getDockerImageName()
echo "Building Docker for ${image_name}" echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{
if(check_host())
{
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
}
else
{
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
}
echo "Build Args: ${dockerArgs}" echo "Build Args: ${dockerArgs}"
try{ try{
...@@ -169,7 +139,7 @@ def buildDocker(install_prefix){ ...@@ -169,7 +139,7 @@ def buildDocker(install_prefix){
else{ else{
echo "Checking for image: ${image_name}" echo "Checking for image: ${image_name}"
sh "docker manifest inspect --insecure ${image_name}" sh "docker manifest inspect --insecure ${image_name}"
echo "Image: ${image_name} found!! Skipping building image" echo "Image: ${image_name} found! Skipping building image"
} }
} }
catch(Exception ex){ catch(Exception ex){
...@@ -210,19 +180,18 @@ def cmake_build(Map conf=[:]){ ...@@ -210,19 +180,18 @@ def cmake_build(Map conf=[:]){
} else{ } else{
setup_args = ' -DBUILD_DEV=On' + setup_args setup_args = ' -DBUILD_DEV=On' + setup_args
} }
if (params.DL_KERNELS){
setup_args = setup_args + " -DDL_KERNELS=ON "
}
if(build_type_debug){ if(build_type_debug){
setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args
}else{ }else{
setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
} }
if(env.CK_CCACHE)
{
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
}
echo "ccache server: ${env.CK_CCACHE}"
def pre_setup_cmd = """ def pre_setup_cmd = """
#!/bin/bash
echo \$HSA_ENABLE_SDMA echo \$HSA_ENABLE_SDMA
ulimit -c unlimited ulimit -c unlimited
rm -rf build rm -rf build
...@@ -231,6 +200,60 @@ def cmake_build(Map conf=[:]){ ...@@ -231,6 +200,60 @@ def cmake_build(Map conf=[:]){
mkdir install mkdir install
cd build cd build
""" """
def invocation_tag=""
if (setup_args.contains("gfx11")){
invocation_tag="gfx11"
}
if (setup_args.contains("gfx10")){
invocation_tag="gfx10"
}
if (setup_args.contains("gfx90")){
invocation_tag="gfx90"
}
if (setup_args.contains("gfx94")){
invocation_tag="gfx94"
}
echo "invocation tag: ${invocation_tag}"
def redis_pre_setup_cmd = pre_setup_cmd
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
redis_pre_setup_cmd = pre_setup_cmd + """
#!/bin/bash
export ROCM_PATH=/opt/rocm
export SCCACHE_ENABLED=true
export SCCACHE_LOG_LEVEL=debug
export SCCACHE_IDLE_TIMEOUT=14400
export COMPILERS_HASH_DIR=/tmp/.sccache
export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
stunnel ../script/redis-cli.conf
../script/sccache_wrapper.sh --enforce_redis
"""
try {
def cmd1 = conf.get("cmd1", """
${redis_pre_setup_cmd}
""")
sh cmd1
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
}
catch(Exception err){
echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
def cmd2 = conf.get("cmd2", """
${pre_setup_cmd}
""")
sh cmd2
}
}
else{
def cmd3 = conf.get("cmd3", """
${pre_setup_cmd}
""")
sh cmd3
}
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory // reduce parallelism when compiling, clang uses too much memory
def nt = nthreads() def nt = nthreads()
...@@ -238,17 +261,19 @@ def cmake_build(Map conf=[:]){ ...@@ -238,17 +261,19 @@ def cmake_build(Map conf=[:]){
def execute_cmd = conf.get("execute_cmd", "") def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """ def cmd = conf.get("cmd", """
${pre_setup_cmd}
${setup_cmd} ${setup_cmd}
${build_cmd} ${build_cmd}
${execute_cmd} ${execute_cmd}
""") """)
echo cmd echo cmd
sh cmd
dir("build"){
sh cmd
}
// Only archive from master or develop // Only archive from master or develop
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) { if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
} }
} }
...@@ -367,8 +392,6 @@ def runCKProfiler(Map conf=[:]){ ...@@ -367,8 +392,6 @@ def runCKProfiler(Map conf=[:]){
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS') timeout(time: 24, unit: 'HOURS')
{ {
//cmake_build(conf)
//instead of building, just unstash the ckProfiler and install it
sh """ sh """
rm -rf build rm -rf build
mkdir build mkdir build
...@@ -525,6 +548,26 @@ def Build_CK(Map conf=[:]){ ...@@ -525,6 +548,26 @@ def Build_CK(Map conf=[:]){
stash "ckprofiler_0.2.0_amd64.deb" stash "ckprofiler_0.2.0_amd64.deb"
} }
} }
if (params.hipTensor_test && navi_node == 0 ){
//build and test hipTensor
sh """#!/bin/bash
rm -rf "${params.hipTensor_branch}".zip
rm -rf hipTensor-"${params.hipTensor_branch}"
wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
unzip -o "${params.hipTensor_branch}".zip
"""
dir("hipTensor-${params.hipTensor_branch}"){
sh """#!/bin/bash
mkdir -p build
ls -ltr
CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="/opt/rocm;${env.WORKSPACE}/install"
cmake --build build -- -j
"""
}
dir("hipTensor-${params.hipTensor_branch}/build"){
sh 'ctest'
}
}
} }
} }
} }
...@@ -612,9 +655,9 @@ def process_results(Map conf=[:]){ ...@@ -612,9 +655,9 @@ def process_results(Map conf=[:]){
} }
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=rc1 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT= 0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : "" 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
pipeline { pipeline {
agent none agent none
...@@ -631,8 +674,8 @@ pipeline { ...@@ -631,8 +674,8 @@ pipeline {
description: "Force building docker image (default: false), set to true if docker image needs to be updated.") description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '5.6', defaultValue: '5.7',
description: 'Specify which ROCM version to use: 5.6 (default).') description: 'Specify which ROCM version to use: 5.7 (default).')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: '', defaultValue: '',
...@@ -649,6 +692,22 @@ pipeline { ...@@ -649,6 +692,22 @@ pipeline {
name: "RUN_FULL_QA", name: "RUN_FULL_QA",
defaultValue: false, defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA") description: "Select whether to run small set of performance tests (default) or full QA")
booleanParam(
name: "DL_KERNELS",
defaultValue: false,
description: "Select whether to build DL kernels (default: OFF)")
booleanParam(
name: "hipTensor_test",
defaultValue: true,
description: "Use the CK build to verify hipTensor build and tests (default: ON)")
string(
name: 'hipTensor_branch',
defaultValue: 'develop',
description: 'Specify which branch of hipTensor to use (default: develop)')
booleanParam(
name: "USE_SCCACHE",
defaultValue: true,
description: "Use the sccache for building CK (default: ON)")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
...@@ -663,15 +722,12 @@ pipeline { ...@@ -663,15 +722,12 @@ pipeline {
} }
stages{ stages{
stage("Build Docker"){ stage("Build Docker"){
//when {
// beforeAgent true
// expression { params.BUILD_DOCKER.toBoolean() }
//}
parallel{ parallel{
stage('Docker /opt/rocm'){ stage('Docker /opt/rocm'){
agent{ label rocmnode("nogpu") } agent{ label rocmnode("nogpu") }
steps{ steps{
buildDocker('/opt/rocm') buildDocker('/opt/rocm')
cleanWs()
} }
} }
} }
...@@ -693,6 +749,7 @@ pipeline { ...@@ -693,6 +749,7 @@ pipeline {
} }
steps{ steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
cleanWs()
} }
} }
} }
...@@ -710,11 +767,12 @@ pipeline { ...@@ -710,11 +767,12 @@ pipeline {
} }
agent{ label rocmnode("gfx908 || gfx90a") } agent{ label rocmnode("gfx908 || gfx90a") }
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
stage("Build CK and run Tests on MI100/MI200") stage("Build CK and run Tests on MI100/MI200")
...@@ -730,6 +788,7 @@ pipeline { ...@@ -730,6 +788,7 @@ pipeline {
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
stage("Build CK and run Tests on Navi21") stage("Build CK and run Tests on Navi21")
...@@ -742,10 +801,10 @@ pipeline { ...@@ -742,10 +801,10 @@ pipeline {
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
stage("Build CK and run Tests on Navi32") stage("Build CK and run Tests on Navi32")
...@@ -756,12 +815,12 @@ pipeline { ...@@ -756,12 +815,12 @@ pipeline {
} }
agent{ label rocmnode("navi32") } agent{ label rocmnode("navi32") }
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DDTYPES="fp16;fp32;bf16" -DGPU_TARGETS="gfx1101" """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDTYPES="fp16;fp32;bf16" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
} }
...@@ -784,6 +843,7 @@ pipeline { ...@@ -784,6 +843,7 @@ pipeline {
} }
steps{ steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
cleanWs()
} }
} }
stage("Run ckProfiler: gfx90a") stage("Run ckProfiler: gfx90a")
...@@ -799,6 +859,7 @@ pipeline { ...@@ -799,6 +859,7 @@ pipeline {
} }
steps{ steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
cleanWs()
} }
} }
} }
...@@ -811,6 +872,7 @@ pipeline { ...@@ -811,6 +872,7 @@ pipeline {
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
process_results() process_results()
cleanWs()
} }
} }
} }
......
# Composable Kernel # Composable Kernel
## Methodology The Composable Kernel (CK) library provides a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
uses general purpose kernel languages, such as HIP C++.
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. CK uses two concepts to achieve performance portability and code maintainability:
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model * A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation". * Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
technique called *Tensor Coordinate Transformation*.
![ALT](/docs/data/ck_component.png "CK Components") ![ALT](/docs/data/ck_component.png "CK Components")
## Code Structure The current CK library is structured into four layers:
Current CK library are structured into 4 layers: * Templated Tile Operators
* "Templated Tile Operators" layer * Templated Kernel and Invoker
* "Templated Kernel and Invoker" layer * Instantiated Kernel and Invoker
* "Instantiated Kernel and Invoker" layer * Client API
* "Client API" layer
![ALT](/docs/data/ck_layer.png "CK Layers") ![ALT](/docs/data/ck_layer.png "CK Layers")
## Documentation ## General information
Run the steps below to build documentation locally. To build our documentation locally, use the following code:
``` ``` bash
cd docs cd docs
pip3 install -r sphinx/requirements.txt pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
``` ```
## Contributors You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
page.
The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
## Citation ```note
If you use CK, cite us as follows:
If you use CK, please use following citations: * [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???) This paper will be available on arXiv soon.
* [CITATION.cff](/CITATION.cff) * [CITATION.cff](/CITATION.cff)
```
## License CK is released under the **[MIT license](/LICENSE)**.
CK is released under the MIT license. [License File](/LICENSE) ## Building CK
We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).
# Build CK 1. To build a new Docker image, use the Dockerfile provided with the source code:
## Build docker image ```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
```bash 2. Launch the Docker container:
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
Pre-built dockers are available from this public repo:
https://hub.docker.com/r/rocm/composable_kernel/tags
## Launch docker ```bash
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
```bash 3. Clone CK source code from the GitHub repository and start the build:
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
## Build CK ```bash
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
cd composable_kernel && \
mkdir build && \
cd build
```
```bash You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
mkdir build && cd build to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.
# Need to specify target ID, example below is for gfx908 and gfx90a ```bash
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
cmake \ If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
-D CMAKE_PREFIX_PATH=/opt/rocm \ supported by the current compiler (this may take a long time).
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the 4. Build the entire CK library:
current compiler.
```bash
make -j
```
Additional cmake flags can be used to significantly speed-up the build: 5. Install CK:
INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library ```bash
while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency. make -j install
```
DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances ## Optional post-install steps
of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl * Build examples and tests:
instances. Those instances are only needed for the NAVI2x platforms.
### Build examples and tests ```bash
make -j examples tests
```
```bash * Build and run all examples and tests:
make -j examples tests
make test ```bash
``` make -j check
```
Instructions for running each individual examples are under [example](/example) You can find instructions for running each individual example in [example](/example).
* Build ckProfiler:
## Build ckProfiler ```bash
make -j ckProfiler
```
You can find instructions for running ckProfiler in [profiler](/profiler).
Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
Depending on the number of CPU cores and the amount of RAM on your system, you may want to
limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
Additional cmake flags can be used to significantly speed-up the build:
* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
dependency and don't plan to run any examples or tests.
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
other data types.
* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
`batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
other platforms have faster instances, such as `xdl` or `wmma`, available.
## Using sccache for building
The default CK Docker images come with a pre-installed version of sccache, which supports clang
being used as hip-compiler (" -x hip"). Using sccache can help reduce the time to re-build code from
hours to 1-2 minutes. In order to invoke sccache, you need to run:
```bash ```bash
make -j ckProfiler sccache --start-server
``` ```
Instructions for running ckProfiler are under [profiler](/profiler)
## Install CK then add the following flags to the cmake command line:
```bash ```bash
make install -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
``` ```
You may need to clean up the build folder and repeat the cmake and make steps in order to take
advantage of the sccache during subsequent builds.
## Using CK as pre-built kernel library ## Using CK as pre-built kernel library
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example) You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).
## Contributing ## Contributing to CK
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run: When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
```bash ```bash
sudo script/install_precommit.sh sudo script/install_precommit.sh
``` ```
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created. With this approach, `pre-commit` adds the appropriate hooks to your local repository and
automatically runs `clang-format` (and possibly additional checks) before any commit is created.
If you need to uninstall hooks from the repository, you can do so by running the following command: If you need to uninstall hooks from the repository, you can do so by running the following command:
...@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the ...@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the
script/uninstall_precommit.sh script/uninstall_precommit.sh
``` ```
If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command. If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
`git commit` command.
## Caveat
### Kernel Timing and Verification
CK's own kernel timer will warn up kernel once, and then run it multiple times
to get average kernel time. For some kernels that use atomic add, this will cause
output buffer to be accumulated multiple times, causing verification failure.
To work around it, do not use CK's own timer and do verification at the same time.
CK's own timer and verification in each example and ckProfiler can be enabled or
disabled from command line.
add_executable(client_layernorm2d layernorm2d.cpp) add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp)
target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations) target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_operations)
add_executable(client_layernorm4d_fwd layernorm4d_fwd.cpp)
target_link_libraries(client_layernorm4d_fwd PRIVATE composable_kernel::device_operations)
...@@ -7,17 +7,19 @@ ...@@ -7,17 +7,19 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp" #include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
using XDataType = ck::half_t; using XDataType = ck::half_t;
using GammaDataType = ck::half_t; using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t; using BetaDataType = ck::half_t;
using YDataType = ck::half_t; using YDataType = ck::half_t;
using ComputeDataType = float; using SaveMeanInvStdDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 2; constexpr int Rank = 2;
constexpr int NumReduceDim = 1; constexpr int NumReduceDim = 1;
...@@ -50,15 +52,19 @@ int main(int argc, char* argv[]) ...@@ -50,15 +52,19 @@ int main(int argc, char* argv[])
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
#ifdef SAVE_MEAN_INV_STD
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType, SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
GammaDataType, SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
BetaDataType, #endif
ComputeDataType,
YDataType, using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
PassThrough, GammaDataType,
Rank, BetaDataType,
NumReduceDim>; YDataType,
SaveMeanInvStdDataType,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -84,14 +90,21 @@ int main(int argc, char* argv[]) ...@@ -84,14 +90,21 @@ int main(int argc, char* argv[])
{0, 1}, // gammaStrides {0, 1}, // gammaStrides
{0, 1}, // betaStrides {0, 1}, // betaStrides
{Stride, 1}, // yStrides {Stride, 1}, // yStrides
{1}, // save_mean Strides
{1}, // save_inv_std Strides
{1}, // reduceDims {1}, // reduceDims
1e-4, 1e-4,
x_device_buf.GetDeviceBuffer(), x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(), gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(), beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(), y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr, nullptr,
nullptr, nullptr,
#endif
PassThrough{}); PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
...@@ -100,11 +113,19 @@ int main(int argc, char* argv[]) ...@@ -100,11 +113,19 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
sizeof(BetaDataType) * N + sizeof(YDataType) * M * N; sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
#ifdef SAVE_MEAN_INV_STD
num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
#endif
float gb_per_sec = num_byte / 1.E6 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
...@@ -136,23 +157,34 @@ int main(int argc, char* argv[]) ...@@ -136,23 +157,34 @@ int main(int argc, char* argv[])
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
{Stride, 1}, // xStrides {Stride, 1}, // xStrides
{1}, // gammaStrides {0, 1}, // gammaStrides
{1}, // betaStrides {0, 1}, // betaStrides
{Stride, 1}, // yStrides {Stride, 1}, // yStrides
{1}, // save_mean Strides
{1}, // save_inv_std Strides
{1}, // reduceDims {1}, // reduceDims
1e-4, 1e-4,
x_device_buf.GetDeviceBuffer(), x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(), gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(), beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(), y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr, nullptr,
nullptr, nullptr,
#endif
PassThrough{}); PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using SaveMeanInvStdDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
ck::index_t N = 256;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t C = 8;
std::vector<ck::index_t> strideXY = {H * W * C, W * C, C, 1};
std::vector<ck::index_t> strideGammaBeta = {0, W * C, C, 1};
std::vector<ck::index_t> strideSaveMeanInvStd = {1};
SimpleDeviceMem x_device_buf(sizeof(XDataType) * N * H * W * C);
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * H * W * C);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * H * W * C);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * N * H * W * C);
#ifdef SAVE_MEAN_INV_STD
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N);
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N);
#endif
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
GammaDataType,
BetaDataType,
YDataType,
SaveMeanInvStdDataType,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer({N, H, W, C}, // lengths
strideXY, // xStrides
strideGammaBeta, // gammaStrides
strideGammaBeta, // betaStrides
strideXY, // yStrides
strideSaveMeanInvStd, // save_mean Strides
strideSaveMeanInvStd, // save_inv_std Strides
{1, 2, 3}, // reduceDims
1e-4,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte =
sizeof(XDataType) * N * H * W * C + sizeof(GammaDataType) * H * W * C +
sizeof(BetaDataType) * H * W * C + sizeof(YDataType) * N * H * W * C;
#ifdef SAVE_MEAN_INV_STD
num_byte += sizeof(SaveMeanInvStdDataType) * N * 2;
#endif
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr =
op_ptr->MakeArgumentPointer({N, H, W, C}, // lengths
strideXY, // xStrides
strideGammaBeta, // gammaStrides
strideGammaBeta, // betaStrides
strideXY, // yStrides
strideSaveMeanInvStd, // save_mean Strides
strideSaveMeanInvStd, // save_inv_std Strides
{1, 2, 3}, // reduceDims
1e-4,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
...@@ -100,18 +100,18 @@ int main() ...@@ -100,18 +100,18 @@ int main()
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * G * N * Wo * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<>, ck::Tuple<>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<>, ck::Tuple<>,
OutDataType, OutDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
PassThrough>; PassThrough>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
......
...@@ -71,18 +71,18 @@ int main() ...@@ -71,18 +71,18 @@ int main()
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<>, ck::Tuple<>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<>, ck::Tuple<>,
OutDataType, OutDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
PassThrough>; PassThrough>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
......
...@@ -80,7 +80,7 @@ int main(int argc, char* argv[]) ...@@ -80,7 +80,7 @@ int main(int argc, char* argv[])
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
NumDimSpatial, NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
......
...@@ -78,18 +78,18 @@ int main(int argc, char* argv[]) ...@@ -78,18 +78,18 @@ int main(int argc, char* argv[])
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<BiasLayout>, ck::Tuple<BiasLayout>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<BiasDataType>, ck::Tuple<BiasDataType>,
OutDataType, OutDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
OutElementOp>; OutElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
......
...@@ -83,7 +83,7 @@ int main(int argc, char* argv[]) ...@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
NumDimSpatial, NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
......
...@@ -79,18 +79,18 @@ int main(int argc, char* argv[]) ...@@ -79,18 +79,18 @@ int main(int argc, char* argv[])
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<BiasLayout>, ck::Tuple<BiasLayout>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<BiasDataType>, ck::Tuple<BiasDataType>,
OutDataType, OutDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
OutElementOp>; OutElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
......
...@@ -76,19 +76,19 @@ int main(int argc, char* argv[]) ...@@ -76,19 +76,19 @@ int main(int argc, char* argv[])
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<RequantScaleLayout>, ck::Tuple<RequantScaleLayout>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<RequantScaleDataType>, ck::Tuple<RequantScaleDataType>,
OutDataType, OutDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
OutElementOp>; OutElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
......
...@@ -72,18 +72,18 @@ int main(int argc, char* argv[]) ...@@ -72,18 +72,18 @@ int main(int argc, char* argv[])
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<>, ck::Tuple<>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<>, ck::Tuple<>,
OutDataType, OutDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
OutElementOp>; OutElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
......
add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp)
target_link_libraries(client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp)
target_link_libraries(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_operations)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment