Commit e330961d authored by aska-0096

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into e2e_kernellib
parents 8e862b7b 1e59eb3b
......@@ -7,6 +7,8 @@ ARG compiler_commit=""
RUN set -xe
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera
# Add rocm repository
RUN apt-get update
RUN apt-get install -y wget gnupg
......@@ -37,6 +39,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
python-dev \
python3-dev \
python3-pip \
sshpass \
software-properties-common \
rocm-dev \
rocm-device-libs \
......
......@@ -14,7 +14,6 @@ def show_node_info() {
def runShell(String command){
def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
def output = readFile(file: "tmp.txt")
echo "tmp.txt contents: $output"
return (output != "")
}
......@@ -172,7 +171,7 @@ def cmake_build(Map conf=[:]){
if(conf.get("build_install","") == "true")
{
config_targets = 'install ' + config_targets
setup_args = ' -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install' + setup_args
setup_args = ' -DBUILD_DEV=On -DCMAKE_INSTALL_PREFIX=../install' + setup_args
} else{
setup_args = ' -DBUILD_DEV=On' + setup_args
}
......@@ -427,6 +426,7 @@ def Build_CK(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
def navi_node = 0
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
......@@ -440,6 +440,9 @@ def Build_CK(Map conf=[:]){
else{
echo "GPU is OK"
}
if ( runShell('grep -n "gfx1030" clinfo.log') ){
navi_node = 1
}
}
}
}
......@@ -458,6 +461,9 @@ def Build_CK(Map conf=[:]){
else{
echo "GPU is OK"
}
if ( runShell('grep -n "gfx1030" clinfo.log') ){
navi_node = 1
}
}
}
}
......@@ -466,16 +472,20 @@ def Build_CK(Map conf=[:]){
{
cmake_build(conf)
dir("build"){
//run tests and examples
sh 'make -j check'
//we only need the ckProfiler to run the performance tests, so we pack and stash it
sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
stash "ckProfiler.tar.gz"
if (navi_node == 0 ){
//run tests and examples on all nodes except Navi
sh 'make -j check'
//we only need the ckProfiler to run the performance tests, so we pack and stash it
sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
stash "ckProfiler.tar.gz"
}
if (params.RUN_FULL_QA){
// build deb packages
sh 'make -j package'
archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
stash "ckprofiler_0.2.0_amd64.deb"
}
}
}
......@@ -543,6 +553,8 @@ def process_results(Map conf=[:]){
unstash "perf_splitK_gemm.log"
unstash "perf_onnx_gemm.log"
sh "./process_qa_data.sh"
unstash "ckprofiler_0.2.0_amd64.deb"
sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
}
else{
// unstash perf files to master
......@@ -564,7 +576,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
0 21 * * * % RUN_FULL_QA=false;COMPILER_VERSION=release;COMPILER_COMMIT=
0 21 * * * % COMPILER_VERSION=release;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline {
......@@ -653,12 +665,28 @@ pipeline {
{
parallel
{
stage("Build CK and run Tests")
stage("Build CK and run Tests on MI100/MI200")
{
agent{ label rocmnode("gfx908 || gfx90a") }
environment{
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ : """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3 " """ }"
execute_args = "${params.COMPILER_VERSION == "ck-9110" ? """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }"
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
stage("Build CK and run Tests on Navi")
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("navi21") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
......@@ -671,7 +699,7 @@ pipeline {
{
parallel
{
stage("Run ckProfiler: gfx908 or gfx90a")
stage("Run ckProfiler: gfx90*")
{
when {
beforeAgent true
......@@ -680,7 +708,7 @@ pipeline {
options { retry(2) }
agent{ label rocmnode("gfx908 || gfx90a")}
environment{
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}"
setup_args = """ -DGPU_TARGETS="gfx908;gfx90a" -DBUILD_DEV=On """
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
......@@ -695,7 +723,7 @@ pipeline {
options { retry(2) }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}"
setup_args = """ -DGPU_TARGETS="gfx90a" -DBUILD_DEV=On """
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
......
......@@ -65,7 +65,8 @@ else()
-Wuninitialized
-Wunreachable-code
-Wunused
-Wno-reserved-identifier
-Werror
-Wsign-compare
-Wno-extra-semi-stmt
)
......
......@@ -775,8 +775,10 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = ../library/include \
../library/include/internal
INPUT = ../include/ck/tensor_operation/gpu/grid \
../include/ck/tensor_operation/gpu/block \
../include/ck/tensor_operation/gpu/thread \
../library/include/ck/library/utility
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
......@@ -845,7 +847,7 @@ FILE_PATTERNS = *.c \
# be searched for input files as well.
# The default value is: NO.
RECURSIVE = NO
RECURSIVE = YES
# The EXCLUDE tag can be used to specify files and/or directories that should be
# excluded from the INPUT source files. This way you can easily exclude a
......
===================
*******************
API Reference Guide
===================
*******************
------------
=================
Introduction
------------
=================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
principles that are used to write new classes that extend CK functionality.
......@@ -16,8 +16,37 @@ Using CK API
This section describes how to use the CK library API.
-----------------
=================
CK Datatypes
=================
-----------------
DeviceMem
-----------------
[TODO]
\ No newline at end of file
.. doxygenstruct:: DeviceMem
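A minimal usage sketch (assuming the byte-size constructor and the ``ToDevice``, ``FromDevice`` and ``GetDeviceBuffer`` helpers currently declared for ``DeviceMem``; the include path and exact signatures should be checked against the header):

.. code-block:: cpp

   #include <cstddef>
   #include <vector>
   #include "ck/library/utility/device_memory.hpp" // path assumed; adjust to your install

   // Sketch only: allocate n floats on the device, upload host data,
   // hand the raw pointer to a kernel, then copy the result back.
   const std::size_t n = 1024;
   std::vector<float> host(n, 1.f);
   DeviceMem buf(sizeof(float) * n);   // size is given in bytes
   buf.ToDevice(host.data());          // host -> device copy
   float* p_dev = static_cast<float*>(buf.GetDeviceBuffer());
   // ... launch a kernel that reads and writes p_dev ...
   buf.FromDevice(host.data());        // device -> host copy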
---------------------------
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists the classes that are
used in the CK GPU implementation of Flashattention.
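For orientation, the fused kernels listed below evaluate the usual scaled-dot-product attention, computed tile by tile so that the full score matrix never has to be materialized in global memory:

.. math::

   O = \mathrm{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V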
**Gridwise classes**
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
**Blockwise classes**
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
.. doxygenstruct:: ck::BlockwiseSoftmax
**Threadwise classes**
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
.. bibliography::
\ No newline at end of file
......@@ -59,10 +59,13 @@ if read_the_docs_build:
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.mathjax', 'breathe']
extensions = ['sphinx.ext.mathjax', 'breathe', 'sphinxcontrib.bibtex']
breathe_projects = { "CK": "../docBin/xml" }
breathe_default_project = "CK"
bibtex_bibfiles = ['refs.bib']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......
@article{dao2022flashattention,
title={Flashattention: Fast and memory-efficient exact attention with io-awareness},
author={Dao, Tri and Fu, Daniel Y and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
journal={arXiv preprint arXiv:2205.14135},
year={2022}
}
......@@ -622,11 +622,16 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
}
};
// Blockwise gemm supporting
// 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
// 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
// source buffer
// 3. configurable k index starting position and step size after each FMA/XDL instruction
/**
* @brief Blockwise gemm
*
* Supports
* 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
* 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
* source buffer
* 3. configurable k index starting position and step size after each FMA/XDL instruction
*/
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
......
......@@ -12,6 +12,16 @@
namespace ck {
/**
* @brief Blockwise softmax
*
* @tparam BlockSize Block size
* @tparam AccDataType Accumulator data type
* @tparam ThreadMap_M_K Thread id to m_k
* @tparam ThreadClusterDesc_M_K Threadwise cluster descriptor
* @tparam ThreadSliceDesc_M_K Threadwise slice descriptor
* @tparam IgnoreNaN Flag to ignore NaN, false by default
*/
template <index_t BlockSize,
typename AccDataType,
typename ThreadMap_M_K, // thread_id to m_k
......
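For readers new to the reduction this block performs, a plain scalar reference for one row may help (this is not the CK class, just the numerically stable softmax that BlockwiseSoftmax distributes across the thread block):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Reference for a single row: subtract the row maximum before exponentiating
    // (for numerical stability), then normalize by the sum of the exponentials.
    std::vector<float> softmax_row(const std::vector<float>& x)
    {
        const float m = *std::max_element(x.begin(), x.end());
        std::vector<float> y(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            y[i] = std::exp(x[i] - m);
            sum += y[i];
        }
        for(float& v : y)
            v /= sum;
        return y;
    }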
......@@ -11,10 +11,15 @@
namespace ck {
// this version does following things to avoid scratch memory issue
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
/**
* @brief Blockwise data transfer
*
* This version does the following to avoid the scratch memory issue:
* 1. Uses StaticallyIndexedArray instead of a C array for the thread buffer
* 2. ThreadwiseTensorSliceTransfer_v3 does not keep a reference to the tensor descriptor
* 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct a new tensor coordinate
*
*/
template <typename ThreadGroup,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,
......
......@@ -166,7 +166,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
const auto b_grid_desc_nraw_kraw =
make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(StrideB, I1));
......@@ -208,39 +208,22 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
}
template <typename ELayout_>
static auto MakeEGridDescriptor_M_N(index_t M, index_t N, index_t StrideE)
static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
{
const auto e_grid_desc_m_n = [&]() {
const auto e_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ELayout_>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideE, I1));
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideE, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELayout_>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideE));
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideE));
}
}();
if constexpr(GemmSpec == GemmSpecialization::MNPadding)
{
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
return transform_tensor_descriptor(
e_grid_desc_m_n,
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
e_grid_desc_m_n,
make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
}
static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& Ms,
......
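For context on what the shared padder has to reproduce: when MNPadding is requested, the inline code removed above right-pads each dimension up to the next multiple of its block tile, i.e. PadM = (MPerBlock - M % MPerBlock) % MPerBlock, and the matrix_padder call is expected to apply the same rounding. A self-contained sketch of that rounding (illustration only, not the CK helper itself):

    // Amount of right-padding needed so that m becomes a multiple of m_per_block.
    constexpr int pad_to_multiple(int m, int m_per_block)
    {
        return (m_per_block - m % m_per_block) % m_per_block;
    }

    static_assert(pad_to_multiple(100, 32) == 28, "100 + 28 = 128 = 4 * 32");
    static_assert(pad_to_multiple(128, 32) == 0, "already a multiple of 32");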
......@@ -12,6 +12,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
......@@ -157,7 +158,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
const auto b_grid_desc_nraw_kraw =
make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(StrideB, I1));
......
......@@ -18,6 +18,10 @@
namespace ck {
/**
* @brief Gridwise gemm + softmax + gemm fusion
*
*/
template <typename FloatAB,
typename FloatGemmAcc,
typename FloatCShuffle,
......
......@@ -1201,7 +1201,12 @@ struct ThreadwiseTensorSliceTransfer_v4
SrcCoord src_ref_coord_;
};
// Do NOT involve any tensor coordinates with StaticBuffer
/**
* @brief Threadwise data transfer
*
* Do NOT involve any tensor coordinates with StaticBuffer
*
*/
template <typename SrcData,
typename DstData,
typename SrcDesc,
......
......@@ -14,6 +14,10 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
}
}
/**
* @brief Container for storing data in GPU device memory
*
*/
struct DeviceMem
{
DeviceMem() = delete;
......