Merge branch 'develop' into ck_tile/fa_bwd_v3

cd4d4629 · danyao12 · 21d12bb7 · 888317e6 · cd4d4629 · cd4d4629
Commit cd4d4629 authored Jan 07, 2025 by danyao12
20 changed files
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
-* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
 # Documentation files
-docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
-*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
-*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
-.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
+We'd love for you to contribute to our source code!
+
+Some helpful links:
+
+- [Code of Conduct guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.txt)
+- [New issue guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/ISSUE_TEMPLATE.md)
+- [Submitting a pull request guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/PULL_REQUEST_TEMPLATE.md)
+- [Maintainers](https://github.com/rocm/composable_kernel/blob/develop/CONTRIBUTORS.md)
+- [General information](https://github.com/rocm/composable_kernel/blob/develop/README.md)
+- [ROCm documentation](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.html)
\ No newline at end of file
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
+When creating an issue, please check if a similar issue already exists.
+
+### When reporting a bug, please include:
+- [ ] A descriptive title
+- [ ] An isolated way to reproduce the behavior (preferably a docker container with a repro)
+- [ ] ROCm version, clang version, Composable Kernel commit pin
+- [ ] Environment variables
+- [ ] The behavior you expect to see, and the behavior you actually see
+
+### When requesting a feature, please include:
+- [ ] A descriptive title
+- [ ] A detailed description of the problem you are trying to solve
+- [ ] An overview of the suggested solution
+- [ ] Explanation why the solution is an improvement
\ No newline at end of file
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
+## Proposed changes
+
+Please describe the motivation behind the pull request, whether it enables a new feature or fixes a bug. If there are associated pull requests or issues, please link them to the pull request.
+
+## Checklist
+
+Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.
+
+- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
+- [ ] I have added inline documentation which enables the maintainers with understanding the motivation
+- [ ] I have removed the stale documentation which is no longer relevant after this pull request
+- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
+- [ ] I have run `clang-format` on all changed files
+- [ ] Any dependent changes have been merged
+
+## Discussion
+
+If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered
+
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ if(GPU_TARGETS)
 else()
    set(USER_GPU_TARGETS 0)
 endif()
-find_package(hip)
+find_package(hip REQUIRED)
 # No assumption that HIP kernels are launched with uniform block size for backward compatibility
 # SWDEV-413293 and https://reviews.llvm.org/D155213
 math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}")

 message("checking which targets are supported")
 #In order to build just the CK library (without tests and examples) for all supported GPU targets
-#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" 
+#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
 #
 #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
 if(NOT ENABLE_ASAN_PACKAGING)
    if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
        # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
    else()
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
    endif()
 else()
    #build CK only for xnack-supported targets when using ASAN
-    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
+    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
 endif()

 #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
@@ -170,27 +170,45 @@ else()
        set(CK_GPU_TARGETS ${GPU_TARGETS})
    endif()
 endif()
-
+#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
+if(NOT USER_GPU_TARGETS)
+    set(GPU_TARGETS "")
+endif()
 #make sure all the targets on the list are actually supported by the current compiler
 rocm_check_target_ids(SUPPORTED_GPU_TARGETS
        TARGETS ${CK_GPU_TARGETS})

 message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")

-if (GPU_TARGETS)
-    if (GPU_TARGETS MATCHES "gfx9")
-        add_definitions(-DCK_USE_XDL)
-        set(CK_USE_XDL "ON")
-    endif()
-    if (GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
-        add_definitions(-DCK_USE_WMMA)
-        set(CK_USE_WMMA "ON")
-    endif()
-else()
-    add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+    message("Enabling XDL instances")
+    add_definitions(-DCK_USE_XDL)
    set(CK_USE_XDL "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    message("Enabling FP8 gemms on native architectures")
+    add_definitions(-DCK_USE_GFX94)
+    set(CK_USE_GFX94 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+    message("Enabling WMMA instances")
+    add_definitions(-DCK_USE_WMMA)
    set(CK_USE_WMMA "ON")
 endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+    add_definitions(-DCK_USE_OCP_FP8)
+    set(CK_USE_OCP_FP8 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    add_definitions(-DCK_USE_FNUZ_FP8)
+    set(CK_USE_FNUZ_FP8 "ON")
+endif()
+
+option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
+if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
+    add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    set(CK_USE_FP8_ON_UNSUPPORTED_ARCH "ON")
+endif()

 # CK config file to record supported datatypes, etc.
 configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
@@ -202,6 +220,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
    add_compile_options(-fno-offload-uniform-block)
  endif()
 endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
+  check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
+  if(HAS_LSR_DROP_SOLUTION)
+    message("Adding the lsr-drop-solution=1 compiler flag")
+    add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
+  endif()
+endif()
 if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
  check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
  if(HAS_ENABLE_POST_MISCHED)
@@ -211,7 +236,7 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
 endif()
 set(check-coerce)
 check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
-if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
+if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132)
   message("Adding the amdgpu-coerce-illegal-types=1")
   add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
 endif()
@@ -311,7 +336,6 @@ link_libraries(${OpenMP_gomp_LIBRARY})
 link_libraries(${OpenMP_pthread_LIBRARY})

 ## HIP
-find_package(HIP REQUIRED)
 # Override HIP version in config.h, if necessary.
 # The variables set by find_package() can't be overwritten,
 # therefore let's use intermediate variables.
@@ -561,7 +585,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
   )
   add_subdirectory(example)
   if(BUILD_TESTING)
-	   add_subdirectory(test)
+       add_subdirectory(test)
   endif()
 endif()

@@ -571,7 +595,7 @@ rocm_package_setup_component(profiler
 )
 add_subdirectory(profiler)

-if(CK_USE_CODEGEN AND (GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
+if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
  add_subdirectory(codegen)
 endif()


--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
+[Back to the main page](./README.md)
 # Composable Kernel Developers and Contributors

 This is the list of developers and contributors to Composable Kernel library

--- a/Dockerfile
+++ b/Dockerfile
-FROM ubuntu:20.04
+FROM ubuntu:22.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.2
+ARG ROCMVERSION=6.3
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
-
-RUN set -xe
-
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
-RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
-# Add rocm repository
-RUN chmod 1777 /tmp
-RUN apt-get update
-RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
-
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
-RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg

-RUN if [ "$ROCMVERSION" != "6.3" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
+# Add rocm repository
+RUN set -xe && \
+    useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
+    curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+
+RUN if [ "$ROCMVERSION" != "6.4" ]; then \
+        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb  --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3.0.1-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3.0.1-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3.0.1 rel-5 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=2033700; \
    fi

-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
-RUN amdgpu-install -y --usecase=rocm --no-dkms
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
+    amdgpu-install -y --usecase=rocm --no-dkms

 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
 ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -57,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libnuma-dev \
    libpthread-stubs0-dev \
    llvm-amdgpu \
+    mpich \
    net-tools \
    pkg-config \
    python \
@@ -72,72 +64,52 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    nano \
    zlib1g-dev \
    zip \
+    libzstd-dev \
    openssh-server \
    clang-format-12 \
    kmod && \
    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf amdgpu-install* && \
+# Remove unnecessary rocm components that take a lot of space
+    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt

-# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
-RUN if [ "$ROCMVERSION" = "6.1" ]; then \
-        sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
-    fi
 # Update the cmake to version 3.27.5
-RUN pip install --upgrade cmake==3.27.5
-
+RUN pip install --upgrade cmake==3.27.5 && \
 #Install latest ccache
-RUN git clone https://github.com/ccache/ccache.git && \
-    cd ccache && mkdir build && cd build && cmake .. && make install
-
+    git clone https://github.com/ccache/ccache.git && \
+    cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
-RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
-RUN gunzip /usr/local/bin/ninja.gz
-RUN chmod a+x /usr/local/bin/ninja
-RUN git clone https://github.com/nico/ninjatracing.git
-
+    cd / && \
+    wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
+    gunzip /usr/local/bin/ninja.gz && \
+    chmod a+x /usr/local/bin/ninja && \
+    git clone https://github.com/nico/ninjatracing.git && \
 #Install latest cppcheck
-RUN git clone https://github.com/danmar/cppcheck.git && \
-    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build .
-WORKDIR /
-
-# Setup ubsan environment to printstacktrace
-RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
-ENV UBSAN_OPTIONS=print_stacktrace=1
-
+    git clone https://github.com/danmar/cppcheck.git && \
+    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
+    cd / && \
 # Install an init system
-RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
-RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
-
-ARG PREFIX=/opt/rocm
+    wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
+    dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
 # Install packages for processing the performance results
-RUN pip3 install --upgrade pip
-RUN pip3 install sqlalchemy==1.4.46
-RUN pip3 install pymysql
-RUN pip3 install pandas==2.0.3
-RUN pip3 install setuptools-rust
-RUN pip3 install sshtunnel==0.4.0
-# Setup ubsan environment to printstacktrace
-ENV UBSAN_OPTIONS=print_stacktrace=1
-
-ENV LC_ALL=C.UTF-8
-ENV LANG=C.UTF-8
-RUN groupadd -f render
-
+    pip3 install --upgrade pip && \
+    pip3 install sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \
+# Add render group
+    groupadd -f render && \
 # Install the new rocm-cmake version
-RUN git clone -b master https://github.com/ROCm/rocm-cmake.git  && \
-  cd rocm-cmake && mkdir build && cd build && \
-  cmake  .. && cmake --build . && cmake --build . --target install
+    git clone -b master https://github.com/ROCm/rocm-cmake.git  && \
+    cd rocm-cmake && mkdir build && cd build && \
+    cmake  .. && cmake --build . && cmake --build . --target install

 WORKDIR /
-
+# Add alternative compilers, if necessary
 ENV compiler_version=$compiler_version
 ENV compiler_commit=$compiler_commit
-RUN sh -c "echo compiler version = '$compiler_version'"
-RUN sh -c "echo compiler commit = '$compiler_commit'"
-
-ARG DISABLE_CACHE=0
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+    sh -c "echo compiler commit = '$compiler_commit'"

-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
        cd llvm-project && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
@@ -145,16 +117,10 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
    else echo "using the release compiler"; \
    fi

-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
        make -j 8 ; \
    else echo "using the release compiler"; \
    fi
-
-#clean-up the deb package
-RUN sh -c "rm -rf amdgpu-install*"
-
-#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
-#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
+FROM $BASE_DOCKER
+ARG compiler_version=""
+ARG compiler_commit=""
+
+# Add alternative compilers, if necessary
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+    sh -c "echo compiler commit = '$compiler_commit'"
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
+        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+        cd llvm-project && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+        make -j 16 ; \
+    else echo "using the release compiler"; \
+    fi
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
+        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+        make -j 16 ; \
+    else echo "using the release compiler"; \
+    fi
--- a/Jenkinsfile
+++ b/Jenkinsfile
--- a/README.md
+++ b/README.md
 # Composable Kernel

+> [!NOTE]
+> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
 The Composable Kernel (CK) library provides a programming model for writing performance-critical
 kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
 uses general purpose kernel languages, such as HIP C++.
@@ -23,23 +26,15 @@ The current CK library is structured into four layers:

 ## General information

-To build our documentation locally, use the following code:
-
-``` bash
-cd docs
-pip3 install -r sphinx/requirements.txt
-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
-```
-
-You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
-
-```note
-If you use CK, cite us as follows:
-
-* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
-  This paper will be available on arXiv soon.
-* [CITATION.cff](/CITATION.cff)
-```
+* [CK supported operations](include/ck/README.md)
+* [CK Tile supported operations](include/ck_tile/README.md)
+* [CK wrapper](client_example/25_wrapper/README.md)
+* [CK codegen](codegen/README.md)
+* [CK profiler](profiler/README.md)
+* [Examples (Custom use of CK supported operations)](example/README.md)
+* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
+* [Terminology](/TERMINOLOGY.md)
+* [Contributors](/CONTRIBUTORS.md)

 CK is released under the **[MIT license](/LICENSE)**.

@@ -134,12 +129,19 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa

    You can find instructions for running ckProfiler in [profiler](/profiler).

-Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
-Depending on the number of CPU cores and the amount of RAM on your system, you may want to
-limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
+* Build our documentation locally:
+
+    ``` bash
+    cd docs
+    pip3 install -r sphinx/requirements.txt
+    python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+    ```

-By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
-crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
+Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
+However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
+crash. On average, you should expect each thread to use ~2Gb of RAM.
+Depending on the number of CPU cores and the amount of RAM on your system, you may want to
+limit the number of threads. For example, if you have a 128-core CPU and 128 Gb of RAM it's advisable to use `-j32`.

 Additional cmake flags can be used to significantly speed-up the build:

@@ -151,6 +153,10 @@ Additional cmake flags can be used to significantly speed-up the build:
  `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
  other platforms have faster instances, such as `xdl` or `wmma`, available.

+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
+  such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not  have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
+  architectures like the MI100/MI200 for the functional support only.
+
 ## Using sccache for building

 The default CK Docker images come with a pre-installed version of sccache, which supports clang

--- a/TERMINOLOGY.md
+++ b/TERMINOLOGY.md
+[Back to the main page](./README.md)
+# Composable Kernel terminology
\ No newline at end of file
--- a/client_example/24_grouped_conv_activation/CMakeLists.txt
+++ b/client_example/24_grouped_conv_activation/CMakeLists.txt
@@ -54,7 +54,7 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
                      PRIVATE composable_kernel::device_conv_operations
                              composable_kernel::device_other_operations
                              composable_kernel::device_reduction_operations
-                              utility)
+                              composable_kernel::utility)
 # Fwd convscale + AMAX
 add_executable(client_conv3d_fwd_convscale_amax_fp8
               grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
@@ -62,7 +62,7 @@ target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
                      PRIVATE composable_kernel::device_conv_operations
                              composable_kernel::device_other_operations
                              composable_kernel::device_reduction_operations
-                              utility)
+                              composable_kernel::utility)
 # Fwd convscale
 add_executable(client_conv3d_fwd_convscale_fp8
               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)

--- a/client_example/25_wrapper/README.md
+++ b/client_example/25_wrapper/README.md
+[Back to the main page](../../README.md)
 # Composable Kernel wrapper GEMM tutorial

-This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
-wrapper. We present the base version of GEMM without most of the available optimizations; however,
-it's worth noting that CK has kernels with different optimizations.
+This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.

-To implement these optimizations, you can use the CK wrapper or directly use available instances in
-CK. You can also refer to the
-[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
-that uses CK wrapper based on the
-[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
+To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.

 The kernel definition should look similar to:


--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
@@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
    constexpr ck::index_t NumDTensor = 2;

    using GroupedGemmKernelArgument =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+        ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;

    std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
    grouped_gemm_kernel_args_.reserve(group_count);

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
@@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
    constexpr ck::index_t NumDTensor = 1;

    using GroupedGemmKernelArgument =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+        ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;

    std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
    grouped_gemm_kernel_args_.reserve(group_count);

--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -56,13 +56,21 @@ if (GPU_TARGETS)
        add_definitions(-DCK_USE_WMMA)
        set(CK_USE_WMMA "ON")
    endif()
+    if (GPU_TARGETS MATCHES "gfx12")
+        add_definitions(-DCK_USE_OCP_FP8)
+        set(CK_USE_OCP_FP8 "ON")
+    endif()
+    if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94")
+        add_definitions(-DCK_USE_FNUZ_FP8)
+        set(CK_USE_FNUZ_FP8 "ON")
+    endif()
 else()
    add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
    set(CK_USE_XDL "ON")
    set(CK_USE_WMMA "ON")
 endif()

-find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations  device_reduction_operations)
+find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations  device_reduction_operations utility)
 if(GPU_TARGETS MATCHES "gfx9")
    find_package(composable_kernel COMPONENTS device_contraction_operations)
 endif()

--- a/client_example/README.md
+++ b/client_example/README.md
+[Back to the main page](../README.md)
+# Composable Kernel client examples
 ##
 Client application links to CK library, and therefore CK library needs to be installed before building client applications.


--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -66,7 +66,7 @@ else()
            -Wunreachable-code
            -Wunused
            -Wno-reserved-identifier
-	    -Werror
+            -Werror
            -Wno-option-ignored
            -Wsign-compare
            -Wno-extra-semi-stmt

--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
+configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)

 find_package(ROCM)
 include(ROCMInstallTargets)

--- a/codegen/README.md
+++ b/codegen/README.md
+[Back to the main page](../README.md)
+# Composable Kernel codegen
\ No newline at end of file