Merge branch 'develop' into mi300

f8a6c69c · Illia Silin · GitHub · 56599d67 · 52f64967 · f8a6c69c
Unverified Commit f8a6c69c authored Mar 13, 2023 by Illia Silin Committed by GitHub Mar 13, 2023
20 changed files
--- a/docs/source/Makefile
+++ b/docs/source/Makefile
+# Minimal makefile for Sphinx documentation
+#
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = CK
+SOURCEDIR     = .
+BUILDDIR      = _build
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+.PHONY: help Makefile
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/source/Supported_Primitives_Guide.rst
+++ b/docs/source/Supported_Primitives_Guide.rst
+==========================
+Supported Primitives Guide
+==========================
+This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference
+Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
+------------
+Softmax
+------------
+For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B`, we can decompose the softmax of the concatenated vector
+:math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as
+.. math::
+   :nowrap:
+   \begin{align}
+      m(x) & = m( [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ] ) = \max( m(x^{(1)}),\ldots, m(x^{(T)}) )  \\
+      f(x) & = [\exp( m(x^{(1)}) - m(x) ) f( x^{(1)} )\ | \ \ldots \ | \ \exp( m(x^{(T)}) - m(x) ) f( x^{(T)} )] \\
+      z(x) & = \exp( m(x^{(1)}) - m(x) )\ z(x^{(1)}) + \ldots + \exp( m(x^{(T)}) - m(x) )\ z(x^{(T)}) \\
+      \operatorname{softmax}(x) &= f(x)\ / \ z(x)
+   \end{align}
+where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
+:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
+For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size :math:`B_r \times B_c` we can
+compute the row-wise softmax as follows.
+For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,
+.. math::
+   :nowrap:
+   \begin{align}
+      \tilde{m}_{ij}   &= \operatorname{rowmax}( X_{ij} ) \\
+      \tilde{P}_{ij}   &= \exp(X_{ij} - \tilde{m}_{ij} ) \\
+      \tilde{z}_{ij}   &= \operatorname{rowsum}( \tilde{P}_{ij} ) \\
+   \end{align}
+If :math:`j=1`, initialize running max, running sum, and the first column block of the output,
+.. math::
+   :nowrap:
+   \begin{align}
+      m_i            &= \tilde{m}_{i1} \\
+      z_i            &= \tilde{z}_{i1} \\
+      \tilde{Y}_{i1} &= \diag(\tilde{z}_{i1})^{-1} \tilde{P}_{i1}
+   \end{align}
+Else if :math:`j>1`,
+1. Update running max, running sum and column blocks :math:`k=1` to :math:`k=j-1`
+.. math::
+   :nowrap:
+   \begin{align}
+      m^{new}_i &= \max(m_i, \tilde{m}_{ij} ) \\
+      z^{new}_i &= \exp(m_i - m^{new}_i)\ z_i + \exp( \tilde{m}_{ij} - m^{new}_i )\ \tilde{z}_{ij}  \\
+      \tilde{Y}_{ik} &= \diag(z^{new}_{i})^{-1} \diag(z_{i}) \exp(m_i - m^{new}_i)\ \tilde{Y}_{ik}
+   \end{align}
+2. Initialize column block :math:`j` of output and reset running max and running sum variables:
+.. math::
+   :nowrap:
+   \begin{align}
+      \tilde{Y}_{ij} &= \diag(z^{new}_{i})^{-1} \exp(\tilde{m}_{ij} - m^{new}_i ) \tilde{P}_{ij} \\
+      z_i            &= z^{new}_i \\
+      m_i            &= m^{new}_i \\
+   \end{align}
\ No newline at end of file
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
--- a/docs/source/dockerhub.rst
+++ b/docs/source/dockerhub.rst
+===================
+CK docker hub
+===================
+`Docker hub <https://hub.docker.com/r/rocm/composable_kernel>`_
+-------------------------------------
+Why do I need this?
+-------------------------------------
+To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images.
+-------------------------------------
+So what is Composable Kernel?
+-------------------------------------
+The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs, CPUs, etc., through general-purpose kernel languages like HIP C++.
+To get the CK library::
+    git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
+run a docker container::
+    docker run                                                            \
+    -it                                                                   \
+    --privileged                                                          \
+    --group-add sudo                                                      \
+    -w /root/workspace                                                    \
+    -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                         \
+    rocm/composable_kernel:ck_ub20.04_rocm5.3_release                     \
+    /bin/bash
+and build the CK::
+    mkdir build && cd build
+    # Need to specify target ID, example below is for gfx908 and gfx90a
+    cmake                                                                                             \
+    -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+    -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
+    -D CMAKE_CXX_FLAGS="-O3"                                                                          \
+    -D CMAKE_BUILD_TYPE=Release                                                                       \
+    -D GPU_TARGETS="gfx908;gfx90a"                                                                    \
+    ..
+and::
+    make -j examples tests
+To run all the test cases including tests and examples run::
+    make test
+We can also run specific examples or tests like::
+    ./bin/example_gemm_xdl_fp16
+    ./bin/test_gemm_fp16
+For more details visit `CK github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_, `CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example>`_, `even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
+-------------------------------------
+And what is inside?
+-------------------------------------
+The docker images have everything you need for running CK including:
+* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
+* `CMake <https://cmake.org/>`_
+* `Compiler <https://github.com/RadeonOpenCompute/llvm-project>`_
+-------------------------------------
+Which image is right for me?
+-------------------------------------
+Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are:
+* "ck" - made for running Composable Kernel
+* "ub20.04" - based on Ubuntu 20.04
+* "rocm5.4" - ROCm platform version 5.4
+* "release" - compiler version is release
+So just pick the right image for your project dependencies and you're all set.
+-------------------------------------
+DIY starts here
+-------------------------------------
+If you need to customize a docker image or just can't stop tinkering, feel free to adjust the `Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_ for your needs.
+-------------------------------------
+License
+-------------------------------------
+CK is released under the MIT `license <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE>`_.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
+============================
+Composable Kernel User Guide
+============================
+.. toctree::
+   :maxdepth: 5
+   :caption: Contents:
+   :numbered:
+   Linux_Install_Guide
+   tutorial_hello_world
+   dockerhub
+   Supported_Primitives_Guide
+   API_Reference_Guide
+   Contributors_Guide
+   Disclaimer
\ No newline at end of file
--- a/docs/source/rocm_logo.png
+++ b/docs/source/rocm_logo.png
--- a/docs/source/tutorial_hello_world.rst
+++ b/docs/source/tutorial_hello_world.rst
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -38,7 +38,9 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
 add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
 add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
-add_custom_target(example_gemm_wmma)
+if(GPU_TARGETS MATCHES "gfx1100")
-add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
+  add_custom_target(example_gemm_wmma)
-add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+  add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
+  add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+endif()
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
 add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx1100")
+    add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp)
+endif()
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
@@ -6,3 +6,9 @@ add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd
 add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
                                                 example_grouped_conv_bwd_weight_xdl_bf16)
+add_custom_target(example_grouped_conv_bwd_weight_dl)
+add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
+add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16)
--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -9,7 +9,6 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -4,7 +4,6 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -116,7 +115,7 @@ auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
        using namespace ck::literals;
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor({row, col}, {stride, 1_uz});
        }

--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -4,7 +4,6 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -115,7 +114,7 @@ auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
        using namespace ck::literals;
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor({row, col}, {stride, 1_uz});
        }

--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            using namespace ck::literals;
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }