Commit cd4d4629 authored by danyao12's avatar danyao12
Browse files

Merge branch 'develop' into ck_tile/fa_bwd_v3

parents 21d12bb7 888317e6
* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk * @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
# Documentation files # Documentation files
docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk *.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk *.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk .readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
# Header directory for Doxygen documentation # Header directory for Doxygen documentation
library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
We'd love for you to contribute to our source code!
Some helpful links:
- [Code of Conduct guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.txt)
- [New issue guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/ISSUE_TEMPLATE.md)
- [Submitting a pull request guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/PULL_REQUEST_TEMPLATE.md)
- [Maintainers](https://github.com/rocm/composable_kernel/blob/develop/CONTRIBUTORS.md)
- [General information](https://github.com/rocm/composable_kernel/blob/develop/README.md)
- [ROCm documentation](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.html)
\ No newline at end of file
When creating an issue, please check if a similar issue already exists.
### When reporting a bug, please include:
- [ ] A descriptive title
- [ ] An isolated way to reproduce the behavior (preferably a docker container with a repro)
- [ ] ROCm version, clang version, Composable Kernel commit pin
- [ ] Environment variables
- [ ] The behavior you expect to see, and the behavior you actually see
### When requesting a feature, please include:
- [ ] A descriptive title
- [ ] A detailed description of the problem you are trying to solve
- [ ] An overview of the suggested solution
- [ ] Explanation why the solution is an improvement
\ No newline at end of file
## Proposed changes
Please describe the motivation behind the pull request, whether it enables a new feature or fixes a bug. If there are associated pull requests or issues, please link them to the pull request.
## Checklist
Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.
- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
- [ ] I have added inline documentation which enables the maintainers with understanding the motivation
- [ ] I have removed the stale documentation which is no longer relevant after this pull request
- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
- [ ] I have run `clang-format` on all changed files
- [ ] Any dependent changes have been merged
## Discussion
If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered
...@@ -137,7 +137,7 @@ if(GPU_TARGETS) ...@@ -137,7 +137,7 @@ if(GPU_TARGETS)
else() else()
set(USER_GPU_TARGETS 0) set(USER_GPU_TARGETS 0)
endif() endif()
find_package(hip) find_package(hip REQUIRED)
# No assumption that HIP kernels are launched with uniform block size for backward compatibility # No assumption that HIP kernels are launched with uniform block size for backward compatibility
# SWDEV-413293 and https://reviews.llvm.org/D155213 # SWDEV-413293 and https://reviews.llvm.org/D155213
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}") math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
...@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}") ...@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}")
message("checking which targets are supported") message("checking which targets are supported")
#In order to build just the CK library (without tests and examples) for all supported GPU targets #In order to build just the CK library (without tests and examples) for all supported GPU targets
#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" #use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
#the GPU_TARGETS flag will be reset in this case in order to avoid conflicts. #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
# #
#In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures. #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
if(NOT ENABLE_ASAN_PACKAGING) if(NOT ENABLE_ASAN_PACKAGING)
if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000) if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
# WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else() else()
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201") set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
endif() endif()
else() else()
#build CK only for xnack-supported targets when using ASAN #build CK only for xnack-supported targets when using ASAN
set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+") set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
endif() endif()
#if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
...@@ -170,27 +170,45 @@ else() ...@@ -170,27 +170,45 @@ else()
set(CK_GPU_TARGETS ${GPU_TARGETS}) set(CK_GPU_TARGETS ${GPU_TARGETS})
endif() endif()
endif() endif()
#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
if(NOT USER_GPU_TARGETS)
set(GPU_TARGETS "")
endif()
#make sure all the targets on the list are actually supported by the current compiler #make sure all the targets on the list are actually supported by the current compiler
rocm_check_target_ids(SUPPORTED_GPU_TARGETS rocm_check_target_ids(SUPPORTED_GPU_TARGETS
TARGETS ${CK_GPU_TARGETS}) TARGETS ${CK_GPU_TARGETS})
message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}") message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
if (GPU_TARGETS) if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if (GPU_TARGETS MATCHES "gfx9") message("Enabling XDL instances")
add_definitions(-DCK_USE_XDL) add_definitions(-DCK_USE_XDL)
set(CK_USE_XDL "ON")
endif()
if (GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON")
endif()
else()
add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
set(CK_USE_XDL "ON") set(CK_USE_XDL "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
message("Enabling FP8 gemms on native architectures")
add_definitions(-DCK_USE_GFX94)
set(CK_USE_GFX94 "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
message("Enabling WMMA instances")
add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON") set(CK_USE_WMMA "ON")
endif() endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_USE_OCP_FP8)
set(CK_USE_OCP_FP8 "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94")
add_definitions(-DCK_USE_FNUZ_FP8)
set(CK_USE_FNUZ_FP8 "ON")
endif()
option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
set(CK_USE_FP8_ON_UNSUPPORTED_ARCH "ON")
endif()
# CK config file to record supported datatypes, etc. # CK config file to record supported datatypes, etc.
configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h) configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
...@@ -202,6 +220,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302) ...@@ -202,6 +220,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
add_compile_options(-fno-offload-uniform-block) add_compile_options(-fno-offload-uniform-block)
endif() endif()
endif() endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
if(HAS_LSR_DROP_SOLUTION)
message("Adding the lsr-drop-solution=1 compiler flag")
add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
endif()
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED) check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
if(HAS_ENABLE_POST_MISCHED) if(HAS_ENABLE_POST_MISCHED)
...@@ -211,7 +236,7 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) ...@@ -211,7 +236,7 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
endif() endif()
set(check-coerce) set(check-coerce)
check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce) check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000) if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132)
message("Adding the amdgpu-coerce-illegal-types=1") message("Adding the amdgpu-coerce-illegal-types=1")
add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1") add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
endif() endif()
...@@ -311,7 +336,6 @@ link_libraries(${OpenMP_gomp_LIBRARY}) ...@@ -311,7 +336,6 @@ link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY}) link_libraries(${OpenMP_pthread_LIBRARY})
## HIP ## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary. # Override HIP version in config.h, if necessary.
# The variables set by find_package() can't be overwritten, # The variables set by find_package() can't be overwritten,
# therefore let's use intermediate variables. # therefore let's use intermediate variables.
...@@ -561,7 +585,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS) ...@@ -561,7 +585,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
) )
add_subdirectory(example) add_subdirectory(example)
if(BUILD_TESTING) if(BUILD_TESTING)
add_subdirectory(test) add_subdirectory(test)
endif() endif()
endif() endif()
...@@ -571,7 +595,7 @@ rocm_package_setup_component(profiler ...@@ -571,7 +595,7 @@ rocm_package_setup_component(profiler
) )
add_subdirectory(profiler) add_subdirectory(profiler)
if(CK_USE_CODEGEN AND (GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
add_subdirectory(codegen) add_subdirectory(codegen)
endif() endif()
......
[Back to the main page](./README.md)
# Composable Kernel Developers and Contributors # Composable Kernel Developers and Contributors
This is the list of developers and contributors to Composable Kernel library This is the list of developers and contributors to Composable Kernel library
......
FROM ubuntu:20.04 FROM ubuntu:22.04
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.2 ARG ROCMVERSION=6.3
ARG compiler_version="" ARG compiler_version=""
ARG compiler_commit="" ARG compiler_commit=""
ARG CK_SCCACHE="" ARG CK_SCCACHE=""
RUN set -xe
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
# Add rocm repository
RUN chmod 1777 /tmp
RUN apt-get update
RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.3" ]; then \ # Add rocm repository
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \ RUN set -xe && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \ useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.4" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3.0.1-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3.0.1-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3.0.1 rel-5 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=2033700; \
fi fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
RUN amdgpu-install -y --usecase=rocm --no-dkms amdgpu-install -y --usecase=rocm --no-dkms
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
...@@ -57,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -57,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libnuma-dev \ libnuma-dev \
libpthread-stubs0-dev \ libpthread-stubs0-dev \
llvm-amdgpu \ llvm-amdgpu \
mpich \
net-tools \ net-tools \
pkg-config \ pkg-config \
python \ python \
...@@ -72,72 +64,52 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -72,72 +64,52 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
nano \ nano \
zlib1g-dev \ zlib1g-dev \
zip \ zip \
libzstd-dev \
openssh-server \ openssh-server \
clang-format-12 \ clang-format-12 \
kmod && \ kmod && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/* && \
rm -rf amdgpu-install* && \
# Remove unnecessary rocm components that take a lot of space
apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
RUN if [ "$ROCMVERSION" = "6.1" ]; then \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
fi
# Update the cmake to version 3.27.5 # Update the cmake to version 3.27.5
RUN pip install --upgrade cmake==3.27.5 RUN pip install --upgrade cmake==3.27.5 && \
#Install latest ccache #Install latest ccache
RUN git clone https://github.com/ccache/ccache.git && \ git clone https://github.com/ccache/ccache.git && \
cd ccache && mkdir build && cd build && cmake .. && make install cd ccache && mkdir build && cd build && cmake .. && make install && \
#Install ninja build tracing tools #Install ninja build tracing tools
RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip cd / && \
RUN gunzip /usr/local/bin/ninja.gz wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
RUN chmod a+x /usr/local/bin/ninja gunzip /usr/local/bin/ninja.gz && \
RUN git clone https://github.com/nico/ninjatracing.git chmod a+x /usr/local/bin/ninja && \
git clone https://github.com/nico/ninjatracing.git && \
#Install latest cppcheck #Install latest cppcheck
RUN git clone https://github.com/danmar/cppcheck.git && \ git clone https://github.com/danmar/cppcheck.git && \
cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
WORKDIR / cd / && \
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
# Install an init system # Install an init system
RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
ARG PREFIX=/opt/rocm
# Install packages for processing the performance results # Install packages for processing the performance results
RUN pip3 install --upgrade pip pip3 install --upgrade pip && \
RUN pip3 install sqlalchemy==1.4.46 pip3 install sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \
RUN pip3 install pymysql # Add render group
RUN pip3 install pandas==2.0.3 groupadd -f render && \
RUN pip3 install setuptools-rust
RUN pip3 install sshtunnel==0.4.0
# Setup ubsan environment to printstacktrace
ENV UBSAN_OPTIONS=print_stacktrace=1
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
RUN groupadd -f render
# Install the new rocm-cmake version # Install the new rocm-cmake version
RUN git clone -b master https://github.com/ROCm/rocm-cmake.git && \ git clone -b master https://github.com/ROCm/rocm-cmake.git && \
cd rocm-cmake && mkdir build && cd build && \ cd rocm-cmake && mkdir build && cd build && \
cmake .. && cmake --build . && cmake --build . --target install cmake .. && cmake --build . && cmake --build . --target install
WORKDIR / WORKDIR /
# Add alternative compilers, if necessary
ENV compiler_version=$compiler_version ENV compiler_version=$compiler_version
ENV compiler_commit=$compiler_commit ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'" RUN sh -c "echo compiler version = '$compiler_version'" && \
RUN sh -c "echo compiler commit = '$compiler_commit'" sh -c "echo compiler commit = '$compiler_commit'"
ARG DISABLE_CACHE=0
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \ cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
...@@ -145,16 +117,10 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd ...@@ -145,16 +117,10 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 8 ; \ make -j 8 ; \
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
#clean-up the deb package
RUN sh -c "rm -rf amdgpu-install*"
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
FROM $BASE_DOCKER
ARG compiler_version=""
ARG compiler_commit=""
# Add alternative compilers, if necessary
ENV compiler_version=$compiler_version
ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'" && \
sh -c "echo compiler commit = '$compiler_commit'"
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 16 ; \
else echo "using the release compiler"; \
fi
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 16 ; \
else echo "using the release compiler"; \
fi
...@@ -32,41 +32,43 @@ def runShell(String command){ ...@@ -32,41 +32,43 @@ def runShell(String command){
return (output != "") return (output != "")
} }
def getDockerImageName(){ def getBaseDockerImageName(){
def img def img
if (params.USE_CUSTOM_DOCKER != ""){ if (params.USE_CUSTOM_DOCKER != ""){
img = "${params.USE_CUSTOM_DOCKER}" img = "${params.USE_CUSTOM_DOCKER}"
} }
else{ else{
if (params.ROCMVERSION != "6.3"){ def ROCM_numeric = "${params.ROCMVERSION}" as float
if (params.COMPILER_VERSION == "") { if ( ROCM_numeric < 6.4 ){
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}"
} }
else{ else{
if (params.COMPILER_COMMIT == ""){ img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}"
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" }
} }
else{ return img
def commit = "${params.COMPILER_COMMIT}"[0..6] }
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
} def getDockerImageName(){
} def img
def base_name = getBaseDockerImageName()
if (params.USE_CUSTOM_DOCKER != ""){
img = "${params.USE_CUSTOM_DOCKER}"
} }
else{ else{
if (params.COMPILER_VERSION == "") { if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" img = "${base_name}"
} }
else{ else{
if (params.COMPILER_COMMIT == ""){ if (params.COMPILER_COMMIT == ""){
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" img = "${base_name}_${params.COMPILER_VERSION}"
} }
else{ else{
def commit = "${params.COMPILER_COMMIT}"[0..6] def commit = "${params.COMPILER_COMMIT}"[0..6]
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" img = "${base_name}_${params.COMPILER_VERSION}_${commit}"
} }
} }
} }
}
return img return img
} }
...@@ -131,17 +133,21 @@ def buildDocker(install_prefix){ ...@@ -131,17 +133,21 @@ def buildDocker(install_prefix){
env.DOCKER_BUILDKIT=1 env.DOCKER_BUILDKIT=1
checkout scm checkout scm
def image_name = getDockerImageName() def image_name = getDockerImageName()
def base_image_name = getBaseDockerImageName()
echo "Building Docker for ${image_name}" echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
dockerArgs = dockerArgs + " --no-cache " dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . "
}
else{
dockerArgs = dockerArgs + " -f Dockerfile . "
} }
echo "Build Args: ${dockerArgs}" echo "Build Args: ${dockerArgs}"
try{ try{
if(params.BUILD_DOCKER){ if(params.BUILD_DOCKER){
//force building the new docker if that parameter is true //force building the new docker if that parameter is true
echo "Building image: ${image_name}" echo "Building image: ${image_name}"
retimage = docker.build("${image_name}", dockerArgs + ' .') retimage = docker.build("${image_name}", dockerArgs)
withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) { withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) {
retimage.push() retimage.push()
} }
...@@ -324,10 +330,8 @@ def cmake_build(Map conf=[:]){ ...@@ -324,10 +330,8 @@ def cmake_build(Map conf=[:]){
try{ try{
archiveArtifacts "perf_fmha_fwd_*.log" archiveArtifacts "perf_fmha_fwd_*.log"
archiveArtifacts "perf_fmha_bwd_*.log" archiveArtifacts "perf_fmha_bwd_*.log"
stash name: "perf_fmha_fwd_gfx942.log" stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942"
stash name: "perf_fmha_bwd_gfx942.log" stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a"
stash name: "perf_fmha_fwd_gfx90a.log"
stash name: "perf_fmha_bwd_gfx90a.log"
} }
catch(Exception err){ catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
...@@ -353,12 +357,12 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -353,12 +357,12 @@ def buildHipClangJob(Map conf=[:]){
def prefixpath = conf.get("prefixpath", "/opt/rocm") def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
...@@ -373,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -373,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 48, unit: 'HOURS') timeout(time: 20, unit: 'HOURS')
{ {
cmake_build(conf) cmake_build(conf)
} }
...@@ -402,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ ...@@ -402,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
} }
} }
def runCKProfiler(Map conf=[:]){
show_node_info()
env.HSA_ENABLE_SDMA=0
checkout scm
def image = getDockerImageName()
def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
}
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
try {
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'rocminfo | tee rocminfo.log'
if ( !runShell('grep -n "gfx" rocminfo.log') ){
throw new Exception ("GPU not found")
}
else{
echo "GPU is OK"
}
}
}
}
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
throw e
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS')
{
sh """
rm -rf build
mkdir build
"""
dir("build"){
unstash 'ckProfiler.tar.gz'
sh 'tar -xvf ckProfiler.tar.gz'
}
dir("script"){
if (params.RUN_FULL_QA){
sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
archiveArtifacts "perf_batched_gemm.log"
archiveArtifacts "perf_grouped_gemm.log"
archiveArtifacts "perf_grouped_conv_fwd.log"
archiveArtifacts "perf_grouped_conv_bwd_data.log"
archiveArtifacts "perf_grouped_conv_bwd_weight.log"
archiveArtifacts "perf_gemm_bilinear.log"
archiveArtifacts "perf_reduction.log"
archiveArtifacts "perf_splitK_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_mixed_gemm.log"
// stash perf files to master
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
stash name: "perf_resnet50_N4.log"
stash name: "perf_batched_gemm.log"
stash name: "perf_grouped_gemm.log"
stash name: "perf_grouped_conv_fwd.log"
stash name: "perf_grouped_conv_bwd_data.log"
stash name: "perf_grouped_conv_bwd_weight.log"
stash name: "perf_gemm_bilinear.log"
stash name: "perf_reduction.log"
stash name: "perf_splitK_gemm.log"
stash name: "perf_onnx_gemm.log"
stash name: "perf_mixed_gemm.log"
//we will process results on the master node
}
else{
sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
// stash perf files to master
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
stash name: "perf_resnet50_N4.log"
//we will process the results on the master node
}
}
}
}
}
return retimage
}
def runPerfTest(Map conf=[:]){
try{
runCKProfiler(conf)
}
catch(e){
echo "throwing error exception in performance tests"
echo 'Exception occurred: ' + e.toString()
throw e
}
finally{
if (!conf.get("no_reboot", false)) {
reboot()
}
}
}
def Build_CK(Map conf=[:]){ def Build_CK(Map conf=[:]){
show_node_info() show_node_info()
...@@ -544,12 +426,12 @@ def Build_CK(Map conf=[:]){ ...@@ -544,12 +426,12 @@ def Build_CK(Map conf=[:]){
def prefixpath = conf.get("prefixpath", "/opt/rocm") def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
if(params.BUILD_LEGACY_OS){ if(params.BUILD_LEGACY_OS){
...@@ -567,7 +449,7 @@ def Build_CK(Map conf=[:]){ ...@@ -567,7 +449,7 @@ def Build_CK(Map conf=[:]){
try { try {
(retimage, image) = getDockerImage(conf) (retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 2, unit: 'MINUTES'){
sh 'rocminfo | tee rocminfo.log' sh 'rocminfo | tee rocminfo.log'
if ( !runShell('grep -n "gfx" rocminfo.log') ){ if ( !runShell('grep -n "gfx" rocminfo.log') ){
throw new Exception ("GPU not found") throw new Exception ("GPU not found")
...@@ -583,36 +465,95 @@ def Build_CK(Map conf=[:]){ ...@@ -583,36 +465,95 @@ def Build_CK(Map conf=[:]){
throw e throw e
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS') timeout(time: 20, unit: 'HOURS')
{ {
//check whether to run performance tests on this node //check whether to run performance tests on this node
def do_perf_tests = 0 def arch_type = 0
sh 'rocminfo | tee rocminfo.log' sh 'rocminfo | tee rocminfo.log'
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ if ( runShell('grep -n "gfx90a" rocminfo.log') ){
do_perf_tests = 1 arch_type = 1
echo "Stash profiler and run performance tests" }
else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
arch_type = 2
}
else if ( runShell('grep -n "gfx1030" rocminfo.log') ) {
arch_type = 3
}
else if ( runShell('grep -n "gfx1101" rocminfo.log') ) {
arch_type = 4
}
else if ( runShell('grep -n "gfx1201" rocminfo.log') ) {
arch_type = 5
} }
cmake_build(conf) cmake_build(conf)
dir("build"){ dir("build"){
//run tests and examples if (params.RUN_FULL_QA && arch_type == 1 ){
//sh 'make -j check' // build deb packages for all gfx9 targets on gfx90a system and prepare to export
if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){ echo "Build ckProfiler package"
//we only need the ckProfiler to run the performance tests, so we pack and stash it
//do not stash profiler on nodes where we don't need to run performance tests
sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
stash name: "ckProfiler.tar.gz"
}
if (params.RUN_FULL_QA && do_perf_tests == 0 ){
// build deb packages for all gfx9 targets and prepare to export
sh 'make -j package' sh 'make -j package'
archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb' sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
stash name: "ckprofiler_0.2.0_amd64.deb" stash includes: "ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb"
} }
} }
if (params.hipTensor_test && do_perf_tests == 0 ){ // run performance tests, stash the logs, results will be processed on the master node
//build and test hipTensor dir("script"){
if (params.RUN_PERFORMANCE_TESTS){
if (params.RUN_FULL_QA && arch_type == 1){
// run full tests on gfx90a
echo "Run full performance tests"
sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
archiveArtifacts "perf_batched_gemm.log"
archiveArtifacts "perf_grouped_gemm.log"
archiveArtifacts "perf_grouped_conv_fwd.log"
archiveArtifacts "perf_grouped_conv_bwd_data.log"
archiveArtifacts "perf_grouped_conv_bwd_weight.log"
archiveArtifacts "perf_gemm_bilinear.log"
archiveArtifacts "perf_reduction.log"
archiveArtifacts "perf_splitK_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_mixed_gemm.log"
stash includes: "perf_**.log", name: "perf_log"
}
else if ( arch_type == 1 ){
// run standard tests on gfx90a
echo "Run performance tests"
sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
stash includes: "perf_**.log", name: "perf_log"
}
// disable performance tests on gfx1030 for now.
//else if ( arch_type == 3){
// run basic tests on gfx1030
// echo "Run gemm performance tests"
// sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10"
// archiveArtifacts "perf_onnx_gemm_gfx10.log"
// stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10"
//}
else if ( arch_type == 4){
// run basic tests on gfx11
echo "Run gemm performance tests"
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11"
archiveArtifacts "perf_onnx_gemm_gfx11.log"
stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11"
}
else if ( arch_type == 5 ){
// run basic tests on gfx12
echo "Run gemm performance tests"
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
archiveArtifacts "perf_onnx_gemm_gfx12.log"
stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
}
}
}
if (params.hipTensor_test && arch_type == 1 ){
// build and test hipTensor on gfx90a node
sh """#!/bin/bash sh """#!/bin/bash
rm -rf "${params.hipTensor_branch}".zip rm -rf "${params.hipTensor_branch}".zip
rm -rf hipTensor-"${params.hipTensor_branch}" rm -rf hipTensor-"${params.hipTensor_branch}"
...@@ -625,11 +566,9 @@ def Build_CK(Map conf=[:]){ ...@@ -625,11 +566,9 @@ def Build_CK(Map conf=[:]){
ls -ltr ls -ltr
CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install" CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install"
cmake --build build -- -j cmake --build build -- -j
ctest --test-dir build
""" """
} }
dir("hipTensor-${params.hipTensor_branch}/build"){
sh 'ctest'
}
} }
} }
} }
...@@ -679,15 +618,13 @@ def process_results(Map conf=[:]){ ...@@ -679,15 +618,13 @@ def process_results(Map conf=[:]){
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 1, unit: 'HOURS'){ timeout(time: 15, unit: 'MINUTES'){
try{ try{
dir("script"){ dir("script"){
if (params.RUN_CK_TILE_FMHA_TESTS){ if (params.RUN_CK_TILE_FMHA_TESTS){
try{ try{
unstash "perf_fmha_fwd_gfx942.log" unstash "perf_fmha_log_gfx942"
unstash "perf_fmha_bwd_gfx942.log" unstash "perf_fmha_log_gfx90a"
unstash "perf_fmha_fwd_gfx90a.log"
unstash "perf_fmha_bwd_gfx90a.log"
} }
catch(Exception err){ catch(Exception err){
echo "could not locate the FMHA performance logs: ${err.getMessage()}." echo "could not locate the FMHA performance logs: ${err.getMessage()}."
...@@ -697,26 +634,26 @@ def process_results(Map conf=[:]){ ...@@ -697,26 +634,26 @@ def process_results(Map conf=[:]){
// unstash perf files to master // unstash perf files to master
unstash "ckprofiler_0.2.0_amd64.deb" unstash "ckprofiler_0.2.0_amd64.deb"
sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
unstash "perf_gemm.log" unstash "perf_log"
unstash "perf_resnet50_N256.log" try{
unstash "perf_resnet50_N4.log" unstash "perf_log_gfx11"
unstash "perf_batched_gemm.log" unstash "perf_log_gfx12"
unstash "perf_grouped_gemm.log" }
unstash "perf_grouped_conv_fwd.log" catch(Exception err){
unstash "perf_grouped_conv_bwd_data.log" echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
unstash "perf_grouped_conv_bwd_weight.log" }
unstash "perf_gemm_bilinear.log"
unstash "perf_reduction.log"
unstash "perf_splitK_gemm.log"
unstash "perf_onnx_gemm.log"
unstash "perf_mixed_gemm.log"
sh "./process_qa_data.sh" sh "./process_qa_data.sh"
} }
else{ else{
// unstash perf files to master // unstash perf files to master
unstash "perf_gemm.log" unstash "perf_log"
unstash "perf_resnet50_N256.log" try{
unstash "perf_resnet50_N4.log" unstash "perf_log_gfx11"
unstash "perf_log_gfx12"
}
catch(Exception err){
echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
}
sh "./process_perf_data.sh" sh "./process_perf_data.sh"
} }
} }
...@@ -734,10 +671,10 @@ def process_results(Map conf=[:]){ ...@@ -734,10 +671,10 @@ def process_results(Map conf=[:]){
} }
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
0 13 * * * % BUILD_LEGACY_OS=true''' : "" 0 13 * * * % BUILD_LEGACY_OS=true''' : ""
...@@ -760,12 +697,12 @@ pipeline { ...@@ -760,12 +697,12 @@ pipeline {
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '6.2', defaultValue: '6.3',
description: 'Specify which ROCM version to use: 6.2 (default).') description: 'Specify which ROCM version to use: 6.3 (default).')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: '', defaultValue: '',
description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline-open, or leave blank (default).') description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
string( string(
name: 'COMPILER_COMMIT', name: 'COMPILER_COMMIT',
defaultValue: '', defaultValue: '',
...@@ -824,8 +761,8 @@ pipeline { ...@@ -824,8 +761,8 @@ pipeline {
description: "Test building instances for various architectures simultaneously (default: OFF)") description: "Test building instances for various architectures simultaneously (default: OFF)")
booleanParam( booleanParam(
name: "BUILD_GFX12", name: "BUILD_GFX12",
defaultValue: false, defaultValue: true,
description: "Build CK and run tests on gfx12 (default: OFF)") description: "Build CK and run tests on gfx12 (default: ON)")
booleanParam( booleanParam(
name: "NINJA_BUILD_TRACE", name: "NINJA_BUILD_TRACE",
defaultValue: false, defaultValue: false,
...@@ -1101,11 +1038,11 @@ pipeline { ...@@ -1101,11 +1038,11 @@ pipeline {
agent{ label rocmnode("gfx90a") } agent{ label rocmnode("gfx90a") }
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
-DCMAKE_CXX_FLAGS=" -O3 " """ -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
-DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
} }
...@@ -1165,7 +1102,7 @@ pipeline { ...@@ -1165,7 +1102,7 @@ pipeline {
execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \ execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """ -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
} }
steps{ steps{
...@@ -1235,29 +1172,6 @@ pipeline { ...@@ -1235,29 +1172,6 @@ pipeline {
} }
} }
} }
stage("Performance Tests")
{
parallel
{
stage("Run ckProfiler: gfx90a")
{
when {
beforeAgent true
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
options { retry(1) }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
cleanWs()
}
}
}
}
stage("Process Performance Test Results") stage("Process Performance Test Results")
{ {
parallel parallel
......
# Composable Kernel # Composable Kernel
> [!NOTE]
> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
The Composable Kernel (CK) library provides a programming model for writing performance-critical The Composable Kernel (CK) library provides a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
uses general purpose kernel languages, such as HIP C++. uses general purpose kernel languages, such as HIP C++.
...@@ -23,23 +26,15 @@ The current CK library is structured into four layers: ...@@ -23,23 +26,15 @@ The current CK library is structured into four layers:
## General information ## General information
To build our documentation locally, use the following code: * [CK supported operations](include/ck/README.md)
* [CK Tile supported operations](include/ck_tile/README.md)
``` bash * [CK wrapper](client_example/25_wrapper/README.md)
cd docs * [CK codegen](codegen/README.md)
pip3 install -r sphinx/requirements.txt * [CK profiler](profiler/README.md)
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html * [Examples (Custom use of CK supported operations)](example/README.md)
``` * [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
* [Terminology](/TERMINOLOGY.md)
You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page. * [Contributors](/CONTRIBUTORS.md)
```note
If you use CK, cite us as follows:
* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
This paper will be available on arXiv soon.
* [CITATION.cff](/CITATION.cff)
```
CK is released under the **[MIT license](/LICENSE)**. CK is released under the **[MIT license](/LICENSE)**.
...@@ -134,12 +129,19 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa ...@@ -134,12 +129,19 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
You can find instructions for running ckProfiler in [profiler](/profiler). You can find instructions for running ckProfiler in [profiler](/profiler).
Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly. * Build our documentation locally:
Depending on the number of CPU cores and the amount of RAM on your system, you may want to
limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM. ``` bash
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
crash. In such cases, you can reduce the number of threads to 32 by using `-j32`. However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
crash. On average, you should expect each thread to use ~2Gb of RAM.
Depending on the number of CPU cores and the amount of RAM on your system, you may want to
limit the number of threads. For example, if you have a 128-core CPU and 128 Gb of RAM it's advisable to use `-j32`.
Additional cmake flags can be used to significantly speed-up the build: Additional cmake flags can be used to significantly speed-up the build:
...@@ -151,6 +153,10 @@ Additional cmake flags can be used to significantly speed-up the build: ...@@ -151,6 +153,10 @@ Additional cmake flags can be used to significantly speed-up the build:
`batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
other platforms have faster instances, such as `xdl` or `wmma`, available. other platforms have faster instances, such as `xdl` or `wmma`, available.
* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
architectures like the MI100/MI200 for the functional support only.
## Using sccache for building ## Using sccache for building
The default CK Docker images come with a pre-installed version of sccache, which supports clang The default CK Docker images come with a pre-installed version of sccache, which supports clang
......
[Back to the main page](./README.md)
# Composable Kernel terminology
\ No newline at end of file
...@@ -54,7 +54,7 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8 ...@@ -54,7 +54,7 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
PRIVATE composable_kernel::device_conv_operations PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations composable_kernel::device_other_operations
composable_kernel::device_reduction_operations composable_kernel::device_reduction_operations
utility) composable_kernel::utility)
# Fwd convscale + AMAX # Fwd convscale + AMAX
add_executable(client_conv3d_fwd_convscale_amax_fp8 add_executable(client_conv3d_fwd_convscale_amax_fp8
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp) grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
...@@ -62,7 +62,7 @@ target_link_libraries(client_conv3d_fwd_convscale_amax_fp8 ...@@ -62,7 +62,7 @@ target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
PRIVATE composable_kernel::device_conv_operations PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations composable_kernel::device_other_operations
composable_kernel::device_reduction_operations composable_kernel::device_reduction_operations
utility) composable_kernel::utility)
# Fwd convscale # Fwd convscale
add_executable(client_conv3d_fwd_convscale_fp8 add_executable(client_conv3d_fwd_convscale_fp8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp) grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)
......
[Back to the main page](../../README.md)
# Composable Kernel wrapper GEMM tutorial # Composable Kernel wrapper GEMM tutorial
This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.
wrapper. We present the base version of GEMM without most of the available optimizations; however,
it's worth noting that CK has kernels with different optimizations.
To implement these optimizations, you can use the CK wrapper or directly use available instances in To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
CK. You can also refer to the
[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
that uses CK wrapper based on the
[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
The kernel definition should look similar to: The kernel definition should look similar to:
......
...@@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
constexpr ck::index_t NumDTensor = 2; constexpr ck::index_t NumDTensor = 2;
using GroupedGemmKernelArgument = using GroupedGemmKernelArgument =
ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>; ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_; std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count); grouped_gemm_kernel_args_.reserve(group_count);
......
...@@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
constexpr ck::index_t NumDTensor = 1; constexpr ck::index_t NumDTensor = 1;
using GroupedGemmKernelArgument = using GroupedGemmKernelArgument =
ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>; ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_; std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count); grouped_gemm_kernel_args_.reserve(group_count);
......
...@@ -56,13 +56,21 @@ if (GPU_TARGETS) ...@@ -56,13 +56,21 @@ if (GPU_TARGETS)
add_definitions(-DCK_USE_WMMA) add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON") set(CK_USE_WMMA "ON")
endif() endif()
if (GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_USE_OCP_FP8)
set(CK_USE_OCP_FP8 "ON")
endif()
if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94")
add_definitions(-DCK_USE_FNUZ_FP8)
set(CK_USE_FNUZ_FP8 "ON")
endif()
else() else()
add_definitions(-DCK_USE_WMMA -DCK_USE_XDL) add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
set(CK_USE_XDL "ON") set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON") set(CK_USE_WMMA "ON")
endif() endif()
find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations) find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations utility)
if(GPU_TARGETS MATCHES "gfx9") if(GPU_TARGETS MATCHES "gfx9")
find_package(composable_kernel COMPONENTS device_contraction_operations) find_package(composable_kernel COMPONENTS device_contraction_operations)
endif() endif()
......
[Back to the main page](../README.md)
# Composable Kernel client examples
## ##
Client application links to CK library, and therefore CK library needs to be installed before building client applications. Client application links to CK library, and therefore CK library needs to be installed before building client applications.
......
...@@ -66,7 +66,7 @@ else() ...@@ -66,7 +66,7 @@ else()
-Wunreachable-code -Wunreachable-code
-Wunused -Wunused
-Wno-reserved-identifier -Wno-reserved-identifier
-Werror -Werror
-Wno-option-ignored -Wno-option-ignored
-Wsign-compare -Wsign-compare
-Wno-extra-semi-stmt -Wno-extra-semi-stmt
......
...@@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) ...@@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
find_package(ROCM) find_package(ROCM)
include(ROCMInstallTargets) include(ROCMInstallTargets)
......
[Back to the main page](../README.md)
# Composable Kernel codegen
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment