Generate output using Doxygen / Breathe (#598)

* Modify Doxygen config to pick up include directories recursively * Add DeviceMem struct to API Reference guide * Add classes that are used in Flash Attention kernel * Add a reference and config for generating bibliography Co-authored-by: Philip Maybank <Philip.Maybank@amd.com>

Generate output using Doxygen / Breathe (#598)
* Modify Doxygen config to pick up include directories recursively * Add DeviceMem struct to API Reference guide * Add classes that are used in Flash Attention kernel * Add a reference and config for generating bibliography Co-authored-by: Philip Maybank <Philip.Maybank@amd.com>
e4bf6d42 · pmaybank · GitHub · e6cda9f8 · e4bf6d42 · e4bf6d42
Unverified Commit e4bf6d42 authored Mar 06, 2023 by pmaybank Committed by GitHub Mar 06, 2023
10 changed files
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -775,8 +775,10 @@ WARN_LOGFILE           =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.

-INPUT                  = ../library/include \
-                         ../library/include/internal
+INPUT                  = ../include/ck/tensor_operation/gpu/grid \
+                         ../include/ck/tensor_operation/gpu/block \
+                         ../include/ck/tensor_operation/gpu/thread \
+                         ../library/include/ck/library/utility

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -845,7 +847,7 @@ FILE_PATTERNS          = *.c \
 # be searched for input files as well.
 # The default value is: NO.

-RECURSIVE              = NO
+RECURSIVE              = YES

 # The EXCLUDE tag can be used to specify files and/or directories that should be
 # excluded from the INPUT source files. This way you can easily exclude a

--- a/docs/source/API_Reference_Guide.rst
+++ b/docs/source/API_Reference_Guide.rst

-===================
+*******************
 API Reference Guide
-===================
+*******************

------------
+=================
 Introduction
------------
+=================

 This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
 principles that are used to write new classes that extend CK functionality.
@@ -16,8 +16,37 @@ Using CK API

 This section describes how to use the CK library API.

-----------------
+=================
 CK Datatypes
+=================
+
+-----------------
+DeviceMem
 -----------------

-[TODO]
\ No newline at end of file
+.. doxygenstruct:: DeviceMem
+
+---------------------------
+Kernels For Flashattention
+---------------------------
+
+The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`.  This sections lists the classes that are
+used in the CK GPU implementation of Flashattention.
+
+**Gridwise classes**
+
+.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
+
+**Blockwise classes**
+
+.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
+
+.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
+
+.. doxygenstruct:: ck::BlockwiseSoftmax
+
+**Threadwise classes**
+
+.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
+
+.. bibliography::
\ No newline at end of file
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -59,10 +59,13 @@ if read_the_docs_build:
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['sphinx.ext.mathjax', 'breathe']
+extensions = ['sphinx.ext.mathjax', 'breathe', 'sphinxcontrib.bibtex']
+
 breathe_projects = { "CK": "../docBin/xml" }
 breathe_default_project = "CK"

+bibtex_bibfiles = ['refs.bib']
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']


--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
+
+@article{dao2022flashattention,
+  title={Flashattention: Fast and memory-efficient exact attention with io-awareness},
+  author={Dao, Tri and Fu, Daniel Y and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
+  journal={arXiv preprint arXiv:2205.14135},
+  year={2022}
+}
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -622,11 +622,16 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
    }
 };

-// Blockwise gemm supporting
-// 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
-// 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
-// source buffer
-// 3. configurable k index starting position and step size after each FMA/XDL instruction
+/**
+ * @brief Blockwise gemm
+ *
+ * Supports
+ * 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
+ * 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
+ * source buffer
+ * 3. configurable k index starting position and step size after each FMA/XDL instruction
+ */
+
 template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,

--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
@@ -12,6 +12,16 @@

 namespace ck {

+/**
+ * @brief Blockwise softmax
+ *
+ * @tparam BlockSize
+ * @tparam AccDataType
+ * @tparam ThreadMap_M_K
+ * @tparam ThreadClusterDesc_M_K
+ * @tparam ThreadSliceDesc_M_K
+ * @tparam IgnoreNaN
+ */
 template <index_t BlockSize,
          typename AccDataType,
          typename ThreadMap_M_K, // thread_id to m_k

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -11,10 +11,15 @@

 namespace ck {

-// this version does following things to avoid scratch memory issue
-// 1. Use StaticallyIndexedArray instead of C array for thread buffer
-// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
-// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+/**
+ * @brief Blockwise data transfer
+ *
+ * This version does following things to avoid scratch memory issue
+ * 1. Use StaticallyIndexedArray instead of C array for thread buffer
+ * 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+ * 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+ *
+ */
 template <typename ThreadGroup,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -18,6 +18,10 @@

 namespace ck {

+/**
+ * @brief Gridwise gemm + softmax + gemm fusion
+ *
+ */
 template <typename FloatAB,
          typename FloatGemmAcc,
          typename FloatCShuffle,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1201,7 +1201,12 @@ struct ThreadwiseTensorSliceTransfer_v4
    SrcCoord src_ref_coord_;
 };

-// Do NOT involve any tensor coordinates with StaticBuffer
+/**
+ * @brief Threadwise data transfer
+ *
+ * Do NOT involve any tensor coordinates with StaticBuffer
+ *
+ */
 template <typename SrcData,
          typename DstData,
          typename SrcDesc,

--- a/library/include/ck/library/utility/device_memory.hpp
+++ b/library/include/ck/library/utility/device_memory.hpp
@@ -14,6 +14,10 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
    }
 }

+/**
+ * @brief Container for storing data in GPU device memory
+ *
+ */
 struct DeviceMem
 {
    DeviceMem() = delete;