sync from public

2298a1a4 · illsilin · 965b7ba4 · 2f088b87 · 2298a1a4 · 2298a1a4
Commit 2298a1a4 authored Dec 09, 2024 by illsilin
20 changed files
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
-* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
 # Documentation files
-docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
+[Back to the main page](./README.md)
 # Composable Kernel Developers and Contributors
 This is the list of developers and contributors to the Composable Kernel library

--- a/Dockerfile
+++ b/Dockerfile
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.2
+ARG ROCMVERSION=6.3
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
-RUN set -xe
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
-RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
-# Add rocm repository
-RUN chmod 1777 /tmp
-RUN apt-get update
-RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
-RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
-RUN if [ "$ROCMVERSION" != "6.3" ]; then \
+# Add rocm repository
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb  --no-check-certificate" && \
+RUN set -xe && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
+    useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
+    curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+RUN if [ "$ROCMVERSION" != "6.4" ]; then \
+        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb  --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=2074281; \
    fi
-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
-RUN amdgpu-install -y --usecase=rocm --no-dkms
+    amdgpu-install -y --usecase=rocm --no-dkms
 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
 ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -76,68 +67,47 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    clang-format-12 \
    kmod && \
    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf amdgpu-install* && \
+# Remove unnecessary rocm components that take a lot of space
+    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev
-# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
-RUN if [ "$ROCMVERSION" = "6.1" ]; then \
-        sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
-    fi
 # Update the cmake to version 3.27.5
-RUN pip install --upgrade cmake==3.27.5
+RUN pip install --upgrade cmake==3.27.5 && \
 #Install latest ccache
-RUN git clone https://github.com/ccache/ccache.git && \
+    git clone https://github.com/ccache/ccache.git && \
-    cd ccache && mkdir build && cd build && cmake .. && make install
+    cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
-RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
+    cd / && \
-RUN gunzip /usr/local/bin/ninja.gz
+    wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
-RUN chmod a+x /usr/local/bin/ninja
+    gunzip /usr/local/bin/ninja.gz && \
-RUN git clone https://github.com/nico/ninjatracing.git
+    chmod a+x /usr/local/bin/ninja && \
+    git clone https://github.com/nico/ninjatracing.git && \
 #Install latest cppcheck
-RUN git clone https://github.com/danmar/cppcheck.git && \
+    git clone https://github.com/danmar/cppcheck.git && \
-    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build .
+    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
-WORKDIR /
+    cd / && \
-# Setup ubsan environment to printstacktrace
-RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
-ENV UBSAN_OPTIONS=print_stacktrace=1
 # Install an init system
-RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
+    wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
-RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
+    dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
-ARG PREFIX=/opt/rocm
 # Install packages for processing the performance results
-RUN pip3 install --upgrade pip
+    pip3 install --upgrade pip && \
-RUN pip3 install sqlalchemy==1.4.46
+    pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \
-RUN pip3 install pymysql
+# Add render group
-RUN pip3 install pandas==2.0.3
+    groupadd -f render && \
-RUN pip3 install setuptools-rust
-RUN pip3 install sshtunnel==0.4.0
-# Setup ubsan environment to printstacktrace
-ENV UBSAN_OPTIONS=print_stacktrace=1
-ENV LC_ALL=C.UTF-8
-ENV LANG=C.UTF-8
-RUN groupadd -f render
 # Install the new rocm-cmake version
-RUN git clone -b master https://github.com/ROCm/rocm-cmake.git  && \
+    git clone -b master https://github.com/ROCm/rocm-cmake.git  && \
-  cd rocm-cmake && mkdir build && cd build && \
+    cd rocm-cmake && mkdir build && cd build && \
-  cmake  .. && cmake --build . && cmake --build . --target install
+    cmake  .. && cmake --build . && cmake --build . --target install
 WORKDIR /
+# Add alternative compilers, if necessary
 ENV compiler_version=$compiler_version
 ENV compiler_commit=$compiler_commit
-RUN sh -c "echo compiler version = '$compiler_version'"
+RUN sh -c "echo compiler version = '$compiler_version'" && \
-RUN sh -c "echo compiler commit = '$compiler_commit'"
+    sh -c "echo compiler commit = '$compiler_commit'"
-ARG DISABLE_CACHE=0
-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
        cd llvm-project && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
@@ -145,16 +115,10 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
    else echo "using the release compiler"; \
    fi
-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
        make -j 8 ; \
    else echo "using the release compiler"; \
    fi
-#clean-up the deb package
-RUN sh -c "rm -rf amdgpu-install*"
-#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
-#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
+# Optional compiler layer: starts from BASE_DOCKER and, when requested through
+# compiler_version / compiler_commit, clones and builds ROCm/llvm-project on top of it.
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3"
+FROM $BASE_DOCKER
+ARG compiler_version=""
+ARG compiler_commit=""
+# Add alternative compilers, if necessary
+# The build args are re-exported as ENV so they remain visible in the resulting image.
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+    sh -c "echo compiler commit = '$compiler_commit'"
+# Case 1: branch is amd-staging or amd-mainline and no commit is pinned -> build the branch tip.
+# NOTE(review): only `make` is run (no install target), so CMAKE_INSTALL_PREFIX=/opt/rocm/llvm
+# is configured but not populated by this step; presumably consumers use the build tree - confirm.
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
+        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+        cd llvm-project && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+        make -j 16 ; \
+    else echo "using the release compiler"; \
+    fi
+# Case 2: same branches, but a commit is pinned -> check that commit out before building.
+# Both RUN steps print "using the release compiler" whenever their own condition is false,
+# so the message also appears from the step that did not apply when the other one built a compiler.
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
+        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+        make -j 16 ; \
+    else echo "using the release compiler"; \
+    fi
--- a/Jenkinsfile
+++ b/Jenkinsfile
--- a/README.md
+++ b/README.md
@@ -26,23 +26,15 @@ The current CK library is structured into four layers:
 ## General information
-To build our documentation locally, use the following code:
+* [CK supported operations](include/ck/README.md)
+* [CK Tile supported operations](include/ck_tile/README.md)
-``` bash
+* [CK wrapper](client_example/25_wrapper/README.md)
-cd docs
+* [CK codegen](codegen/README.md)
-pip3 install -r sphinx/requirements.txt
+* [CK profiler](profiler/README.md)
-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+* [Examples (Custom use of CK supported operations)](example/README.md)
-```
+* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
+* [Terminology](/TERMINOLOGY.md)
-You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
+* [Contributors](/CONTRIBUTORS.md)
-```note
-If you use CK, cite us as follows:
-* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
-  This paper will be available on arXiv soon.
-* [CITATION.cff](/CITATION.cff)
-```
 CK is released under the **[MIT license](/LICENSE)**.
@@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
    You can find instructions for running ckProfiler in [profiler](/profiler).
+* Build our documentation locally:
+    ``` bash
+    cd docs
+    pip3 install -r sphinx/requirements.txt
+    python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+    ```
 Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
 However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and
 crash. On average, you should expect each thread to use ~2 GB of RAM.
@@ -154,8 +154,7 @@ Additional cmake flags can be used to significantly speed-up the build:
  other platforms have faster instances, such as `xdl` or `wmma`, available.
 * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
-  such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not
+  such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
-  have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
  architectures like the MI100/MI200 for the functional support only.
 ## Using sccache for building

--- a/TERMINOLOGY.md
+++ b/TERMINOLOGY.md
+[Back to the main page](./README.md)
+# Composable Kernel terminology
\ No newline at end of file
--- a/client_example/25_wrapper/README.md
+++ b/client_example/25_wrapper/README.md
+[Back to the main page](../../README.md)
 # Composable Kernel wrapper GEMM tutorial
-This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
+This tutorial demonstrates how to implement matrix multiplication using the Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.
-wrapper. We present the base version of GEMM without most of the available optimizations; however,
-it's worth noting that CK has kernels with different optimizations.
-To implement these optimizations, you can use the CK wrapper or directly use available instances in
+To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), which uses the CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
-CK. You can also refer to the
-[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
-that uses CK wrapper based on the
-[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
 The kernel definition should look similar to:

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
@@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
    constexpr ck::index_t NumDTensor = 2;
    using GroupedGemmKernelArgument =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+        ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
    std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
    grouped_gemm_kernel_args_.reserve(group_count);

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
@@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
    constexpr ck::index_t NumDTensor = 1;
    using GroupedGemmKernelArgument =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+        ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
    std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
    grouped_gemm_kernel_args_.reserve(group_count);

--- a/client_example/README.md
+++ b/client_example/README.md
+[Back to the main page](../README.md)
+# Composable Kernel client examples
 ##
 Client applications link to the CK library, and therefore the CK library needs to be installed before building client applications.

--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
+configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
 find_package(ROCM)
 include(ROCMInstallTargets)

--- a/codegen/README.md
+++ b/codegen/README.md
+[Back to the main page](../README.md)
+# Composable Kernel codegen
\ No newline at end of file
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.8.4
+rocm-docs-core==1.11.0
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.8.4
+rocm-docs-core==1.11.0
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -77,9 +77,16 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
 add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
+add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3)
 add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
 add_custom_target(example_gemm_wmma)
 add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
 add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+add_example_executable(example_gemm_wmma_bf16 gemm_wmma_bf16.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16)
+add_example_executable(example_gemm_wmma_int8 gemm_wmma_int8.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_int8)
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -44,7 +44,7 @@ struct ProblemSizeStreamK final
    ck::index_t StrideB = -1;
    ck::index_t StrideC = -1;
-    ck::index_t NumSKBlocks = -1;
+    ck::index_t NumSKBlocks = -1; // number of stream-k blocks
 };
 struct ProblemSizeStreamK_universal final
 {

--- a/example/01_gemm/gemm_wmma_bf16.cpp
+++ b/example/01_gemm/gemm_wmma_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using CDataType        = ck::bhalf_t;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding; // named "Default" but selects the MNKPadding specialization
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // template arguments below are positional; unlabelled ones carry NOTE(review) hints
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault,
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           2,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat // NOTE(review): the values here fit MPerBlock / (MPerWmma * M-Repeat) = M-Wave (64 / (16 * 2) = 2), not MPerWmma / M-Repeat - confirm
+           4,           // N-Repeat // NOTE(review): the values here fit NPerBlock / (NPerWmma * N-Repeat) = N-Wave (128 / (16 * 4) = 2), not NPerWmma / N-Repeat - confirm
+           S<4, 32, 1>, // NOTE(review): product 4 * 32 * 1 = 128 equals BlockSize; presumably the A-side block-transfer thread cluster, with the next six arguments being the remaining A-side transfer settings - confirm against DeviceGemmWmma_CShuffle
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           S<4, 32, 1>, // NOTE(review): same seven-argument group repeated, presumably for the B side - confirm
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>, // NOTE(review): product 1 * 32 * 1 * 4 = 128 equals BlockSize; presumably the C-shuffle store thread cluster - confirm
+           8>; // NOTE(review): presumably the vector width of the C-shuffle store - confirm
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; // host-namespace reference GEMM over the same data types and element ops
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // device-namespace counterpart; additionally takes the three layouts
+#include "run_gemm_example.inc" // included after the aliases above; presumably defines run_gemm_example() in terms of them - confirm
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } // result is negated so that a truthy return becomes exit code 0
--- a/example/01_gemm/gemm_wmma_int8.cpp
+++ b/example/01_gemm/gemm_wmma_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+using ADataType        = int8_t;
+using BDataType        = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using CDataType        = int8_t;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding; // named "Default" but selects the MNKPadding specialization
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // template arguments below are positional; unlabelled ones carry NOTE(review) hints
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault,
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           2,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat // NOTE(review): the values here fit MPerBlock / (MPerWmma * M-Repeat) = M-Wave (64 / (16 * 2) = 2), not MPerWmma / M-Repeat - confirm
+           4,           // N-Repeat // NOTE(review): the values here fit NPerBlock / (NPerWmma * N-Repeat) = N-Wave (128 / (16 * 4) = 2), not NPerWmma / N-Repeat - confirm
+           S<4, 32, 1>, // NOTE(review): product 4 * 32 * 1 = 128 equals BlockSize; presumably the A-side block-transfer thread cluster, with the next six arguments being the remaining A-side transfer settings - confirm against DeviceGemmWmma_CShuffle
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           S<4, 32, 1>, // NOTE(review): same seven-argument group repeated, presumably for the B side - confirm
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>, // NOTE(review): product 1 * 32 * 1 * 4 = 128 equals BlockSize; presumably the C-shuffle store thread cluster - confirm
+           8>; // NOTE(review): presumably the vector width of the C-shuffle store - confirm
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; // host-namespace reference GEMM over the same data types and element ops
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // device-namespace counterpart; additionally takes the three layouts
+#include "run_gemm_example.inc" // included after the aliases above; presumably defines run_gemm_example() in terms of them - confirm
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } // result is negated so that a truthy return becomes exit code 0
--- a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
@@ -8,7 +8,7 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::half_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;
 using ALayout = Row;
@@ -43,6 +43,17 @@ using DeviceGemmV2_Streamk_Instance =
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example_streamk_v2.inc"
 int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }