Commit d254ed90 authored by Adam Osewski

Update comments to conform with doxygen style.

parent f6f70673
@@ -82,22 +82,22 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout,
BElementwiseOperation,
CElementwiseOperation>
{
//------------------------------------------------------------------------//
// @brief Sets the k batch size.
//
// @param p_arg Pointer to the Argument we're going to change.
// @param[in] kbatch The kbatch value.
//
///------------------------------------------------------------------------//
/// @brief Sets the k batch size.
///
/// @param p_arg Pointer to the Argument we're going to change.
/// @param[in] kbatch The kbatch value.
///
virtual void SetKBatchSize(BaseArgument* /*p_arg*/, index_t /*kbatch*/) const {}
//------------------------------------------------------------------------//
//
// @brief Sets the device kernel arguments pointer.
//
// @param p_arg The pointer to the Argument we're going to update.
// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel
// arguments.
//
///------------------------------------------------------------------------//
///
/// @brief Sets the device kernel arguments pointer.
///
/// @param p_arg The pointer to the Argument we're going to update.
/// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel
/// arguments.
///
virtual void SetDeviceKernelArgs(BaseArgument* /*p_arg*/,
const void* /*p_dev_kernel_args*/) const
{
......
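To make the two setters above concrete, here is a minimal, self-contained analogue of the interface; the types are illustrative stand-ins, not the real CK classes, and only mirror the SetKBatchSize / SetDeviceKernelArgs contract and call order:

```cpp
#include <cstdint>
#include <iostream>

struct BaseArgument
{
    virtual ~BaseArgument() = default;
};

// Illustrative argument type: the real one also carries GEMM descriptors.
struct GroupedGemmArgument : BaseArgument
{
    std::int32_t k_batch          = 1;       // number of pieces K is split into
    const void* p_dev_kernel_args = nullptr; // device buffer with per-group args
};

struct DeviceGroupedGemmSplitK
{
    virtual ~DeviceGroupedGemmSplitK() = default;
    // Sets the k batch size on a previously created argument.
    virtual void SetKBatchSize(BaseArgument* p_arg, std::int32_t kbatch) const
    {
        static_cast<GroupedGemmArgument*>(p_arg)->k_batch = kbatch;
    }
    // Points the argument at caller-owned device memory holding kernel arguments.
    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, const void* p_dev) const
    {
        static_cast<GroupedGemmArgument*>(p_arg)->p_dev_kernel_args = p_dev;
    }
};

int main()
{
    GroupedGemmArgument arg;
    DeviceGroupedGemmSplitK op;
    op.SetKBatchSize(&arg, 4);             // split K into 4 batches
    op.SetDeviceKernelArgs(&arg, nullptr); // in practice a device allocation
    std::cout << "k_batch = " << arg.k_batch << "\n";
}
```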
@@ -22,22 +22,22 @@ template <typename InDataType,
index_t NumReduceDim>
struct DeviceSoftmax : public BaseOperator
{
//
// @brief Makes a pointer to Argument class.
//
// @param[in] inLengths Input tensor extent(s) from high to low dimension
// @param[in] inStrides Input tensor stride(s) from high to low dimension
// @param[in] reduceDims The dimension(s) the normalization operation is applied
// @param[in] alpha double type value
// @param[in] beta double type value
// @param[in] in_dev Typeless const pointer in device memory storing the input
// tensor
// @param out_dev Typeless pointer in device memory storing the output tensor
// @param[in] in_elementwise_op The input elementwise operation.
// @param[in] acc_elementwise_op The accumulation elementwise operation.
//
// @return Unique pointer to the Argument class.
//
///
/// @brief Makes a pointer to the Argument class.
///
/// @param[in] inLengths Input tensor extent(s) from high to low dimension
/// @param[in] inStrides Input tensor stride(s) from high to low dimension
/// @param[in] reduceDims The dimension(s) the normalization operation is applied to
/// @param[in] alpha Alpha scaling value (double type)
/// @param[in] beta Beta scaling value (double type)
/// @param[in] in_dev Typeless const pointer in device memory storing the input
/// tensor
/// @param out_dev Typeless pointer in device memory storing the output tensor
/// @param[in] in_elementwise_op The input elementwise operation.
/// @param[in] acc_elementwise_op The accumulation elementwise operation.
///
/// @return Unique pointer to the Argument class.
///
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
......
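A host-side sketch of how these inputs fit together for a packed 3D tensor reduced along its innermost dimension; the extents are made up, and the actual MakeArgumentPointer call is elided since no concrete DeviceSoftmax instantiation appears here:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> inLengths{8, 16, 1024};      // extents, high to low dimension
    std::vector<int> inStrides(inLengths.size()); // packed row-major strides
    int s = 1;
    for(int d = static_cast<int>(inLengths.size()) - 1; d >= 0; --d)
    {
        inStrides[d] = s; // innermost dimension gets stride 1
        s *= inLengths[d];
    }
    std::vector<int> reduceDims{2};  // softmax along the innermost axis
    double alpha = 1.0, beta = 0.0;  // scaling values passed as doubles here

    std::printf("strides: %d %d %d, alpha=%.1f beta=%.1f\n",
                inStrides[0], inStrides[1], inStrides[2], alpha, beta);
}
```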
@@ -201,22 +201,22 @@ __global__ void
}
} // namespace
//
// @brief Device Convolution operation.
//
// Supports:
// @li Forward convolution with up to 3 spatial dimentions
// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format
//
// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
//
///
/// @brief Device Convolution operation.
///
/// Supports:
/// @li Forward convolution with up to 3 spatial dimensions
/// @li Input tensor in GNWC data format
/// @li Weight tensor in GKXC data format
/// @li Output tensor in GNWK data format
///
/// 1D:
/// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
/// 2D:
/// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
/// 3D:
/// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
///
template <index_t NDimSpatial,
typename ADataType,
typename BDataType,
......
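The 1D formula above, written out as a plain reference loop; a sketch under simplifying assumptions (the G dimension is dropped, the convolution stride is 1, and there is no padding or dilation):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// out[N, Wo, K] = sum over x, c of in[N, wo + x, C] * wei[K, X, C]
void conv1d_nwc(const std::vector<float>& in,  // [N, Wi, C]
                const std::vector<float>& wei, // [K, X, C]
                std::vector<float>& out,       // [N, Wo, K], Wo = Wi - X + 1
                std::size_t N, std::size_t Wi, std::size_t C,
                std::size_t K, std::size_t X)
{
    const std::size_t Wo = Wi - X + 1;
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t wo = 0; wo < Wo; ++wo)
            for(std::size_t k = 0; k < K; ++k)
            {
                float acc = 0.f;
                for(std::size_t x = 0; x < X; ++x)
                    for(std::size_t c = 0; c < C; ++c)
                        acc += in[(n * Wi + wo + x) * C + c] *
                               wei[(k * X + x) * C + c];
                out[(n * Wo + wo) * K + k] = acc;
            }
}

int main()
{
    const std::size_t N = 1, Wi = 5, C = 2, K = 3, X = 3;
    std::vector<float> in(N * Wi * C, 1.f), wei(K * X * C, 1.f);
    std::vector<float> out(N * (Wi - X + 1) * K);
    conv1d_nwc(in, wei, out, N, Wi, C, K, X);
    std::printf("out[0] = %f\n", out[0]); // X*C = 6 for all-ones inputs
}
```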
@@ -154,22 +154,22 @@ __global__ void
} // namespace
//
// @brief Device Convolution operation.
//
// Supports:
// @li Forward convolution with up to 3 spatial dimentions
// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format
//
// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
//
///
/// @brief Device Convolution operation.
///
/// Supports:
/// @li Forward convolution with up to 3 spatial dimensions
/// @li Input tensor in GNWC data format
/// @li Weight tensor in GKXC data format
/// @li Output tensor in GNWK data format
///
/// 1D:
/// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
/// 2D:
/// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
/// 3D:
/// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
///
template <
index_t NDimSpatial,
typename ADataType,
......
@@ -76,23 +76,23 @@ struct ComputePtrOffsetOfStridedBatch
} // namespace
//
// @brief Device Convolution operation.
//
// Supports:
// @li Forward convolution with up to 3 spatial dimentions
// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format
//
// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
// Assume:
// AK1 == BK1
///
/// @brief Device Convolution operation.
///
/// Supports:
/// @li Forward convolution with up to 3 spatial dimensions
/// @li Input tensor in GNWC data format
/// @li Weight tensor in GKXC data format
/// @li Output tensor in GNWK data format
///
/// 1D:
/// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
/// 2D:
/// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
/// 3D:
/// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
/// Assume:
/// AK1 == BK1
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
......
@@ -194,22 +194,22 @@ __global__ void
} // namespace
//
// @brief Device Convolution operation.
//
// Supports:
// @li Forward convolution with up to 3 spatial dimentions
// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format
//
// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
//
///
/// @brief Device Convolution operation.
///
/// Supports:
/// @li Forward convolution with up to 3 spatial dimensions
/// @li Input tensor in GNWC data format
/// @li Weight tensor in GKXC data format
/// @li Output tensor in GNWK data format
///
/// 1D:
/// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
/// 2D:
/// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
/// 3D:
/// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
///
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
......
@@ -25,23 +25,23 @@ namespace ck {
namespace tensor_operation {
namespace device {
//
// @brief Entry point kernel for device-wide Grouped GEMM operation.
//
// @param[in] gemm_desc_const The pointer to the array of GEMM descriptor structures.
// @param[in] tile_count The overall number of output tiles we divided all groups
// into.
// @param[in] k_batch The number of batches we split the K dimension into.
//
// @tparam GridwiseGemm The specific GridwiseGEMM algorithm implementation.
// @tparam GemmDesc The structure holding all necessary descriptors and other
// data needed for groupd gemm calculation and work
// distribution.
// @tparam HasMainKBlockLoop Flag indicating whether all GEMM problem configurations
// need to loop over tiles in K dimension.
// @tparam CGlobalMemoryDataOperation The functor used to store data in output C matrix.
// In example could be: AtomicAdd or Store.
//
///
/// @brief Entry point kernel for device-wide Grouped GEMM operation.
///
/// @param[in] gemm_desc_const The pointer to the array of GEMM descriptor structures.
/// @param[in] tile_count            The total number of output tiles all groups were
///                                  divided into.
/// @param[in] k_batch The number of batches we split the K dimension into.
///
/// @tparam GridwiseGemm The specific GridwiseGEMM algorithm implementation.
/// @tparam GemmDesc                       The structure holding all necessary descriptors and
///                                        other data needed for grouped GEMM calculation and
///                                        work distribution.
/// @tparam HasMainKBlockLoop Flag indicating whether all GEMM problem configurations
/// need to loop over tiles in K dimension.
/// @tparam CGlobalMemoryDataOperation     The functor used to store data in the output C
///                                        matrix, for example AtomicAdd or Store.
///
template <typename GridwiseGemm,
typename GemmDesc,
typename FloatA,
@@ -383,18 +383,18 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
// Assume we want to have at most 2 waves per SIMD
static constexpr int CU_BLOCKS = math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
//
// @brief Launch Grouped Gemm kernel.
//
// @note This function overload is using user provided device buffer for kernel
// arguments.
//
// @param[in] arg The structure containing kernel arguments (in host memory).
// @param[in] dev_gemm_args The point to device memory with kernel arguments.
// @param[in] stream_config The device stream configuration.
//
// @return The average kernel execution time (if time measurement is enabled.)
//
///
/// @brief Launch Grouped Gemm kernel.
///
/// @note This function overload uses a user-provided device buffer for kernel
///       arguments.
///
/// @param[in] arg The structure containing kernel arguments (in host memory).
/// @param[in] dev_gemm_args The pointer to device memory with kernel arguments.
/// @param[in] stream_config The device stream configuration.
///
/// @return The average kernel execution time (if time measurement is enabled).
///
float Run(const Argument& arg,
const void* dev_gemm_args,
const StreamConfig& stream_config = StreamConfig{})
@@ -451,18 +451,18 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
return ave_time;
}
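Since the kernel entry point documented earlier takes a tile_count rather than a per-block assignment, the grid can be capped by occupancy and each block loops over tiles. A host-side sketch of that distribution, with made-up numbers:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    const int tile_count           = 1000; // all groups' output tiles, times k_batch
    const int max_occupancy_blocks = 240;  // e.g. CUs * blocks-per-CU
    const int grid_size            = std::min(tile_count, max_occupancy_blocks);

    // Each "block" b processes tiles b, b + grid_size, b + 2*grid_size, ...
    int processed = 0;
    for(int b = 0; b < grid_size; ++b)
        for(int tile = b; tile < tile_count; tile += grid_size)
            ++processed;

    std::printf("grid=%d, tiles=%d, processed=%d\n",
                grid_size, tile_count, processed);
}
```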
//
// @brief Launch Grouped Gemm kernel.
//
// @note This function overload is using device workspace buffer for kernel arguments.
// The user should call @see GetWorkSpaceSize and @see SetWorkSpacePointer on
// arg parameter to properly allocate this buffer.
//
// @param[in] arg The structure containing kernel arguments (in host memory).
// @param[in] stream_config The device stream configuration.
//
// @return The average kernel execution time (if time measurement is enabled.)
//
///
/// @brief Launch Grouped Gemm kernel.
///
/// @note This function overload uses the device workspace buffer for kernel
///       arguments. The user should call @see GetWorkSpaceSize and @see
///       SetWorkSpacePointer on the arg parameter to properly allocate this buffer.
///
/// @param[in] arg The structure containing kernel arguments (in host memory).
/// @param[in] stream_config The device stream configuration.
///
/// @return The average kernel execution time (if time measurement is enabled).
///
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(arg.p_workspace_ != nullptr)
......
@@ -348,24 +348,24 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
acc_elementwise_op};
};
//
// @brief Makes a pointer to Argument class.
//
// @param[in] inLengths Input tensor extent(s) from high to low dimension
// @param[in] inStrides Input tensor stride(s) from high to low dimension
// @param[in] reduceDims The dimension(s) the normalization operation is applied
// @param[in] alpha Typeless pointer in host memory storing the alpha scaling
// value as type AccDataType
// @param[in] beta Typeless pointer in host memory storing the beta scaling
// value as type AccDataType
// @param[in] in_dev Typeless const pointer in device memory storing the input
// tensor
// @param out_dev Typeless pointer in device memory storing the output tensor
// @param[in] in_elementwise_op The input elementwise operation.
// @param[in] acc_elementwise_op The accumulation elementwise operation.
//
// @return Unique pointer to the Argument class.
//
///
/// @brief Makes a pointer to the Argument class.
///
/// @param[in] inLengths Input tensor extent(s) from high to low dimension
/// @param[in] inStrides Input tensor stride(s) from high to low dimension
/// @param[in] reduceDims The dimension(s) the normalization operation is applied to
/// @param[in] alpha Typeless pointer in host memory storing the alpha scaling
/// value as type AccDataType
/// @param[in] beta Typeless pointer in host memory storing the beta scaling
/// value as type AccDataType
/// @param[in] in_dev Typeless const pointer in device memory storing the input
/// tensor
/// @param out_dev Typeless pointer in device memory storing the output tensor
/// @param[in] in_elementwise_op The input elementwise operation.
/// @param[in] acc_elementwise_op The accumulation elementwise operation.
///
/// @return Unique pointer to the Argument class.
///
std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
const std::vector<int> reduceDims,
......
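Unlike the interface-level overload, whose alpha and beta are documented above as plain double values, this implementation-level overload reads the scaling factors through typeless host pointers as AccDataType. A small sketch of that calling convention, assuming AccDataType is float:

```cpp
#include <cstdio>

using AccDataType = float;

void make_argument(const void* p_alpha, const void* p_beta)
{
    // The implementation reads the scaling factors back as AccDataType.
    const AccDataType alpha = *static_cast<const AccDataType*>(p_alpha);
    const AccDataType beta  = *static_cast<const AccDataType*>(p_beta);
    std::printf("alpha=%f beta=%f\n", alpha, beta);
}

int main()
{
    AccDataType alpha = 2.0f, beta = 0.5f; // values live in host memory
    make_argument(&alpha, &beta);          // pass host pointers, not values
}
```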
@@ -622,30 +622,31 @@ struct OffsettedBlockToCTileMap
index_t block_start_;
};
//
// @brief Simple tile mapping which creates 3D grid of block of threads.
//
// @paragraph Description
// This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
// blocks. The first 2D are regular 2D tiles created by division of output GEMM
// dimenions by corresponding tile size. The third dimension (Z) is a k-split dimension,
// which denotes the number of blocks we use to divide work on GEMM K dimension onto.
//
// @tparam MPerBlock Output block tile size in M dimension.
// @tparam NPerBlock Output block tile size in N dimension.
//
///
/// @brief Simple tile mapping which creates a 3D grid of thread blocks.
///
/// @paragraph Description
/// This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
/// blocks. The first two dimensions are regular 2D tiles created by dividing the output
/// GEMM dimensions by the corresponding tile sizes. The third dimension (Z) is the
/// k-split dimension, which denotes the number of blocks used to divide the work along
/// the GEMM K dimension.
///
/// @tparam MPerBlock Output block tile size in M dimension.
/// @tparam NPerBlock Output block tile size in N dimension.
///
template <index_t MPerBlock, index_t NPerBlock>
struct BlockToCTileMap_3DGrid_KSplit
{
__host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default;
//
// @brief Constructs a new instance.
//
// @param <unnamed> Swallow blockIdx.
//
// @tparam TopIdx The type of block index.
//
///
/// @brief Constructs a new instance.
///
/// @param <unnamed> Swallows blockIdx.
///
/// @tparam TopIdx The type of block index.
///
template <typename TopIdx>
__host__ __device__ BlockToCTileMap_3DGrid_KSplit(TopIdx)
{
@@ -680,14 +681,14 @@ struct BlockToCTileMap_3DGrid_KSplit
}
};
//
// @brief Block to CTile Map which foster external mechanism for setting up local block id.
//
// In example this type can be easily used to implement tile looping work distribution
// scheme.
//
// @tparam UnderlyingBlockToCTileMap The type of the local tile mapp.
//
///
/// @brief Block to CTile Map which fosters an external mechanism for setting up the local block id.
///
/// For example, this type can easily be used to implement a tile-looping work distribution
/// scheme.
///
/// @tparam UnderlyingBlockToCTileMap The type of the local tile map.
///
template <typename UnderlyingBlockToCTileMap>
struct LocalBlockToCTileMap
{
......
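A self-contained sketch of the 3D-grid k-split mapping described above; the axis conventions and index math are illustrative, and the real map may order dimensions differently:

```cpp
#include <cstdio>

struct Tile3D { int m, n, ksplit; };

Tile3D map_block(int bx, int by, int bz) // ~ blockIdx.{x,y,z}
{
    return Tile3D{by, bx, bz}; // x -> N tile, y -> M tile, z -> K split
}

int main()
{
    const int M = 512, N = 768, MPerBlock = 128, NPerBlock = 128, k_split = 2;
    const int m_blocks = (M + MPerBlock - 1) / MPerBlock; // 4
    const int n_blocks = (N + NPerBlock - 1) / NPerBlock; // 6
    // Grid is (n_blocks, m_blocks, k_split); each z-slice works on a partial
    // K range and partial results are reduced (e.g. atomically) into C.
    Tile3D t = map_block(5, 3, 1);
    std::printf("grid=(%d,%d,%d) -> tile (m=%d, n=%d, ksplit=%d)\n",
                n_blocks, m_blocks, k_split, t.m, t.n, t.ksplit);
}
```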
@@ -14,27 +14,27 @@ namespace ck {
namespace tensor_operation {
namespace host {
//
// @brief Reference implementation for forward convolution.
//
// @paragraph
// Tensor descriptor in GNCHW/GKCXY/GNKHW dimensional order
// Supports both GNCHW/NGCHW as well as GNHWC/NHWGC physical layout
// as long as dimensions in tensor descriptor is in GNCHW order
//
// @tparam InDataType Input tensor data type.
// @tparam WeiDataType Weights tensor data type.
// @tparam OutDataType Output tensor data type.
// @tparam InElementwiseOperation Functor for input tensor elementwise
// operation.
// @tparam WeiElementwiseOperation Functor for weights tensor elementwise
// operation.
// @tparam NDimSpatial Number of spatial dimensions.
//
// input descriptor in [G, N, C, Do, Ho, Wo] order
// weight descriptor in [G, K, C, Z, Y, X] order
// output descriptor in [G, N, K, Di, Hi, Wi] order
// phyiscal layout is irrelavent
///
/// @brief Reference implementation for forward convolution.
///
/// @paragraph
/// Tensor descriptors are in GNCHW/GKCXY/GNKHW dimensional order.
/// Supports both GNCHW/NGCHW as well as GNHWC/NHWGC physical layouts,
/// as long as the dimensions in the tensor descriptor are in GNCHW order.
///
/// @tparam InDataType Input tensor data type.
/// @tparam WeiDataType Weights tensor data type.
/// @tparam OutDataType Output tensor data type.
/// @tparam InElementwiseOperation Functor for input tensor elementwise
/// operation.
/// @tparam WeiElementwiseOperation Functor for weights tensor elementwise
/// operation.
/// @tparam NDimSpatial Number of spatial dimensions.
///
/// input descriptor in [G, N, C, Di, Hi, Wi] order
/// weight descriptor in [G, K, C, Z, Y, X] order
/// output descriptor in [G, N, K, Do, Ho, Wo] order
/// physical layout is irrelevant
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
......
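The closing remark, that the physical layout is irrelevant as long as the descriptor order is GNCHW, comes down to strides. A sketch with made-up extents showing GNCHW-ordered descriptors over two different physical layouts:

```cpp
#include <array>
#include <cstdio>

int main()
{
    // Logical order is fixed: [G, N, C, H, W], extents G=2 N=4 C=8 H=16 W=16.
    // Packed GNCHW strides (W fastest):
    const std::array<long, 5> gnchw{4 * 8 * 16 * 16, 8 * 16 * 16, 16 * 16, 16, 1};
    // Same logical order, but physical NHWGC layout (C fastest, then G, W, H, N):
    const std::array<long, 5> nhwgc{8, 16 * 16 * 2 * 8, 1, 16 * 2 * 8, 2 * 8};

    auto offset = [](const std::array<long, 5>& s,
                     long g, long n, long c, long h, long w) {
        // The descriptor's stride vector alone decides the memory location.
        return g * s[0] + n * s[1] + c * s[2] + h * s[3] + w * s[4];
    };
    std::printf("GNCHW offset: %ld, NHWGC offset: %ld\n",
                offset(gnchw, 1, 2, 3, 4, 5), offset(nhwgc, 1, 2, 3, 4, 5));
}
```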