"docs/vscode:/vscode.git/clone" did not exist on "5f65ef4d0ad71ec4e405ceecc1fe7f339124a92e"
Commit d254ed90 authored by Adam Osewski's avatar Adam Osewski
Browse files

Update comments to conform with doxygen style.

parent f6f70673
...@@ -82,22 +82,22 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout, ...@@ -82,22 +82,22 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout,
BElementwiseOperation, BElementwiseOperation,
CElementwiseOperation> CElementwiseOperation>
{ {
//------------------------------------------------------------------------// ///------------------------------------------------------------------------//
// @brief Sets the k batch size. /// @brief Sets the k batch size.
// ///
// @param p_arg Pointer to the Argument we're going to change. /// @param p_arg Pointer to the Argument we're going to change.
// @param[in] kbatch The kbatch value. /// @param[in] kbatch The kbatch value.
// ///
virtual void SetKBatchSize(BaseArgument* /*p_arg*/, index_t /*kbatch*/) const {} virtual void SetKBatchSize(BaseArgument* /*p_arg*/, index_t /*kbatch*/) const {}
//------------------------------------------------------------------------// ///------------------------------------------------------------------------//
// ///
// @brief Sets the device kernel arguments pointer. /// @brief Sets the device kernel arguments pointer.
// ///
// @param p_arg The pointer to the Argument we're going to update. /// @param p_arg The pointer to the Argument we're going to update.
// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel
// arguments. /// arguments.
// ///
virtual void SetDeviceKernelArgs(BaseArgument* /*p_arg*/, virtual void SetDeviceKernelArgs(BaseArgument* /*p_arg*/,
const void* /*p_dev_kernel_args*/) const const void* /*p_dev_kernel_args*/) const
{ {
......
...@@ -22,22 +22,22 @@ template <typename InDataType, ...@@ -22,22 +22,22 @@ template <typename InDataType,
index_t NumReduceDim> index_t NumReduceDim>
struct DeviceSoftmax : public BaseOperator struct DeviceSoftmax : public BaseOperator
{ {
// ///
// @brief Makes a pointer to Argument class. /// @brief Makes a pointer to Argument class.
// ///
// @param[in] inLengths Input tensor extent(s) from high to low dimension /// @param[in] inLengths Input tensor extent(s) from high to low dimension
// @param[in] inStrides Input tensor stride(s) from high to low dimension /// @param[in] inStrides Input tensor stride(s) from high to low dimension
// @param[in] reduceDims The dimension(s) the normalization operation is applied /// @param[in] reduceDims The dimension(s) the normalization operation is applied
// @param[in] alpha double type value /// @param[in] alpha double type value
// @param[in] beta double type value /// @param[in] beta double type value
// @param[in] in_dev Typeless const pointer in device memory storing the input /// @param[in] in_dev Typeless const pointer in device memory storing the input
// tensor /// tensor
// @param out_dev Typeless pointer in device memory storing the output tensor /// @param out_dev Typeless pointer in device memory storing the output tensor
// @param[in] in_elementwise_op The input elementwise operation. /// @param[in] in_elementwise_op The input elementwise operation.
// @param[in] acc_elementwise_op The accumulation elementwise operation. /// @param[in] acc_elementwise_op The accumulation elementwise operation.
// ///
// @return Unique pointer to the Argument class. /// @return Unique pointer to the Argument class.
// ///
virtual std::unique_ptr<BaseArgument> virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<index_t> inLengths, MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides, const std::vector<index_t> inStrides,
......
...@@ -201,22 +201,22 @@ __global__ void ...@@ -201,22 +201,22 @@ __global__ void
} }
} // namespace } // namespace
// ///
// @brief Device Convolution operation. /// @brief Device Convolution operation.
// ///
// Supports: /// Supports:
// @li Forward convolution with up to 3 spatial dimensions /// @li Forward convolution with up to 3 spatial dimensions
// @li Input tensor in GNWC data format /// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format /// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format /// @li Output tensor in GNWK data format
// ///
// 1D: /// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] /// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D: /// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] /// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D: /// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] /// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
// ///
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ADataType, typename ADataType,
typename BDataType, typename BDataType,
......
...@@ -154,22 +154,22 @@ __global__ void ...@@ -154,22 +154,22 @@ __global__ void
} // namespace } // namespace
// ///
// @brief Device Convolution operation. /// @brief Device Convolution operation.
// ///
// Supports: /// Supports:
// @li Forward convolution with up to 3 spatial dimensions /// @li Forward convolution with up to 3 spatial dimensions
// @li Input tensor in GNWC data format /// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format /// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format /// @li Output tensor in GNWK data format
// ///
// 1D: /// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] /// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D: /// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] /// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D: /// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] /// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
// ///
template < template <
index_t NDimSpatial, index_t NDimSpatial,
typename ADataType, typename ADataType,
......
...@@ -76,23 +76,23 @@ struct ComputePtrOffsetOfStridedBatch ...@@ -76,23 +76,23 @@ struct ComputePtrOffsetOfStridedBatch
} // namespace } // namespace
// ///
// @brief Device Convolution operation. /// @brief Device Convolution operation.
// ///
// Supports: /// Supports:
// @li Forward convolution with up to 3 spatial dimensions /// @li Forward convolution with up to 3 spatial dimensions
// @li Input tensor in GNWC data format /// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format /// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format /// @li Output tensor in GNWK data format
// ///
// 1D: /// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] /// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D: /// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] /// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D: /// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] /// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
// Assume: /// Assume:
// AK1 == BK1 /// AK1 == BK1
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
......
...@@ -194,22 +194,22 @@ __global__ void ...@@ -194,22 +194,22 @@ __global__ void
} // namespace } // namespace
// ///
// @brief Device Convolution operation. /// @brief Device Convolution operation.
// ///
// Supports: /// Supports:
// @li Forward convolution with up to 3 spatial dimensions /// @li Forward convolution with up to 3 spatial dimensions
// @li Input tensor in GNWC data format /// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format /// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format /// @li Output tensor in GNWK data format
// ///
// 1D: /// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] /// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D: /// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] /// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D: /// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] /// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
// ///
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
......
...@@ -25,23 +25,23 @@ namespace ck { ...@@ -25,23 +25,23 @@ namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace device { namespace device {
// ///
// @brief Entry point kernel for device-wide Grouped GEMM operation. /// @brief Entry point kernel for device-wide Grouped GEMM operation.
// ///
// @param[in] gemm_desc_const The pointer to the array of GEMM descriptor structures. /// @param[in] gemm_desc_const The pointer to the array of GEMM descriptor structures.
// @param[in] tile_count The overall number of output tiles we divided all groups /// @param[in] tile_count The overall number of output tiles we divided all groups
// into. /// into.
// @param[in] k_batch The number of batches we split the K dimension into. /// @param[in] k_batch The number of batches we split the K dimension into.
// ///
// @tparam GridwiseGemm The specific GridwiseGEMM algorithm implementation. /// @tparam GridwiseGemm The specific GridwiseGEMM algorithm implementation.
// @tparam GemmDesc The structure holding all necessary descriptors and other /// @tparam GemmDesc The structure holding all necessary descriptors and
// data needed for grouped gemm calculation and work /// other data needed for grouped gemm calculation and work
// distribution. /// distribution.
// @tparam HasMainKBlockLoop Flag indicating whether all GEMM problem configurations /// @tparam HasMainKBlockLoop Flag indicating whether all GEMM problem configurations
// need to loop over tiles in K dimension. /// need to loop over tiles in K dimension.
// @tparam CGlobalMemoryDataOperation The functor used to store data in output C matrix. /// @tparam CGlobalMemoryDataOperation The functor used to store data in output C matrix.
// For example: AtomicAdd or Store. /// For example: AtomicAdd or Store.
// ///
template <typename GridwiseGemm, template <typename GridwiseGemm,
typename GemmDesc, typename GemmDesc,
typename FloatA, typename FloatA,
...@@ -383,18 +383,18 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo ...@@ -383,18 +383,18 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
// Assume we want to have at most 2 waves per SIMD // Assume we want to have at most 2 waves per SIMD
static constexpr int CU_BLOCKS = math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES); static constexpr int CU_BLOCKS = math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
// ///
// @brief Launch Grouped Gemm kernel. /// @brief Launch Grouped Gemm kernel.
// ///
// @note This function overload is using user provided device buffer for kernel /// @note This function overload is using user provided device buffer for kernel
// arguments. /// arguments.
// ///
// @param[in] arg The structure containing kernel arguments (in host memory). /// @param[in] arg The structure containing kernel arguments (in host memory).
// @param[in] dev_gemm_args The pointer to device memory with kernel arguments. /// @param[in] dev_gemm_args The pointer to device memory with kernel arguments.
// @param[in] stream_config The device stream configuration. /// @param[in] stream_config The device stream configuration.
// ///
// @return The average kernel execution time (if time measurement is enabled.) /// @return The average kernel execution time (if time measurement is enabled.)
// ///
float Run(const Argument& arg, float Run(const Argument& arg,
const void* dev_gemm_args, const void* dev_gemm_args,
const StreamConfig& stream_config = StreamConfig{}) const StreamConfig& stream_config = StreamConfig{})
...@@ -451,18 +451,18 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo ...@@ -451,18 +451,18 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
return ave_time; return ave_time;
} }
// ///
// @brief Launch Grouped Gemm kernel. /// @brief Launch Grouped Gemm kernel.
// ///
// @note This function overload is using device workspace buffer for kernel arguments. /// @note This function overload is using device workspace buffer for kernel
// The user should call @see GetWorkSpaceSize and @see SetWorkSpacePointer on /// arguments. The user should call @see GetWorkSpaceSize and @see
// arg parameter to properly allocate this buffer. /// SetWorkSpacePointer on arg parameter to properly allocate this buffer.
// ///
// @param[in] arg The structure containing kernel arguments (in host memory). /// @param[in] arg The structure containing kernel arguments (in host memory).
// @param[in] stream_config The device stream configuration. /// @param[in] stream_config The device stream configuration.
// ///
// @return The average kernel execution time (if time measurement is enabled.) /// @return The average kernel execution time (if time measurement is enabled.)
// ///
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{ {
if(arg.p_workspace_ != nullptr) if(arg.p_workspace_ != nullptr)
......
...@@ -348,24 +348,24 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType, ...@@ -348,24 +348,24 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
acc_elementwise_op}; acc_elementwise_op};
}; };
// ///
// @brief Makes a pointer to Argument class. /// @brief Makes a pointer to Argument class.
// ///
// @param[in] inLengths Input tensor extent(s) from high to low dimension /// @param[in] inLengths Input tensor extent(s) from high to low dimension
// @param[in] inStrides Input tensor stride(s) from high to low dimension /// @param[in] inStrides Input tensor stride(s) from high to low dimension
// @param[in] reduceDims The dimension(s) the normalization operation is applied /// @param[in] reduceDims The dimension(s) the normalization operation is applied
// @param[in] alpha Typeless pointer in host memory storing the alpha scaling /// @param[in] alpha Typeless pointer in host memory storing the alpha scaling
// value as type AccDataType /// value as type AccDataType
// @param[in] beta Typeless pointer in host memory storing the beta scaling /// @param[in] beta Typeless pointer in host memory storing the beta scaling
// value as type AccDataType /// value as type AccDataType
// @param[in] in_dev Typeless const pointer in device memory storing the input /// @param[in] in_dev Typeless const pointer in device memory storing the input
// tensor /// tensor
// @param out_dev Typeless pointer in device memory storing the output tensor /// @param out_dev Typeless pointer in device memory storing the output tensor
// @param[in] in_elementwise_op The input elementwise operation. /// @param[in] in_elementwise_op The input elementwise operation.
// @param[in] acc_elementwise_op The accumulation elementwise operation. /// @param[in] acc_elementwise_op The accumulation elementwise operation.
// ///
// @return Unique pointer to the Argument class. /// @return Unique pointer to the Argument class.
// ///
std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths, std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides, const std::vector<index_t> inStrides,
const std::vector<int> reduceDims, const std::vector<int> reduceDims,
......
...@@ -622,30 +622,31 @@ struct OffsettedBlockToCTileMap ...@@ -622,30 +622,31 @@ struct OffsettedBlockToCTileMap
index_t block_start_; index_t block_start_;
}; };
// ///
// @brief Simple tile mapping which creates 3D grid of block of threads. /// @brief Simple tile mapping which creates 3D grid of block of threads.
// ///
// @paragraph Description /// @paragraph Description
// This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread /// This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
// blocks. The first 2D are regular 2D tiles created by division of output GEMM /// blocks. The first 2D are regular 2D tiles created by division of output GEMM
// dimensions by corresponding tile size. The third dimension (Z) is a k-split dimension, /// dimensions by corresponding tile size. The third dimension (Z) is a k-split
// which denotes the number of blocks we use to divide work on GEMM K dimension onto. /// dimension, which denotes the number of blocks we use to divide work on GEMM K
// /// dimension onto.
// @tparam MPerBlock Output block tile size in M dimension. ///
// @tparam NPerBlock Output block tile size in N dimension. /// @tparam MPerBlock Output block tile size in M dimension.
// /// @tparam NPerBlock Output block tile size in N dimension.
///
template <index_t MPerBlock, index_t NPerBlock> template <index_t MPerBlock, index_t NPerBlock>
struct BlockToCTileMap_3DGrid_KSplit struct BlockToCTileMap_3DGrid_KSplit
{ {
__host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default; __host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default;
// ///
// @brief Constructs a new instance. /// @brief Constructs a new instance.
// ///
// @param <unnamed> Swallow blockIdx. /// @param <unnamed> Swallow blockIdx.
// ///
// @tparam TopIdx The type of block index. /// @tparam TopIdx The type of block index.
// ///
template <typename TopIdx> template <typename TopIdx>
__host__ __device__ BlockToCTileMap_3DGrid_KSplit(TopIdx) __host__ __device__ BlockToCTileMap_3DGrid_KSplit(TopIdx)
{ {
...@@ -680,14 +681,14 @@ struct BlockToCTileMap_3DGrid_KSplit ...@@ -680,14 +681,14 @@ struct BlockToCTileMap_3DGrid_KSplit
} }
}; };
// ///
// @brief Block to CTile Map which fosters an external mechanism for setting up local block id. /// @brief Block to CTile Map which fosters an external mechanism for setting up local block id.
// ///
// For example, this type can be easily used to implement tile looping work distribution /// For example, this type can be easily used to implement tile looping work distribution
// scheme. /// scheme.
// ///
// @tparam UnderlyingBlockToCTileMap The type of the local tile map. /// @tparam UnderlyingBlockToCTileMap The type of the local tile map.
// ///
template <typename UnderlyingBlockToCTileMap> template <typename UnderlyingBlockToCTileMap>
struct LocalBlockToCTileMap struct LocalBlockToCTileMap
{ {
......
...@@ -14,27 +14,27 @@ namespace ck { ...@@ -14,27 +14,27 @@ namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace host { namespace host {
// ///
// @brief Reference implementation for forward convolution. /// @brief Reference implementation for forward convolution.
// ///
// @paragraph /// @paragraph
// Tensor descriptor in GNCHW/GKCXY/GNKHW dimensional order /// Tensor descriptor in GNCHW/GKCXY/GNKHW dimensional order
// Supports both GNCHW/NGCHW as well as GNHWC/NHWGC physical layout /// Supports both GNCHW/NGCHW as well as GNHWC/NHWGC physical layout
// as long as dimensions in tensor descriptor is in GNCHW order /// as long as dimensions in tensor descriptor is in GNCHW order
// ///
// @tparam InDataType Input tensor data type. /// @tparam InDataType Input tensor data type.
// @tparam WeiDataType Weights tensor data type. /// @tparam WeiDataType Weights tensor data type.
// @tparam OutDataType Output tensor data type. /// @tparam OutDataType Output tensor data type.
// @tparam InElementwiseOperation Functor for input tensor elementwise /// @tparam InElementwiseOperation Functor for input tensor elementwise
// operation. /// operation.
// @tparam WeiElementwiseOperation Functor for weights tensor elementwise /// @tparam WeiElementwiseOperation Functor for weights tensor elementwise
// operation. /// operation.
// @tparam NDimSpatial Number of spatial dimensions. /// @tparam NDimSpatial Number of spatial dimensions.
// ///
// input descriptor in [G, N, C, Di, Hi, Wi] order /// input descriptor in [G, N, C, Di, Hi, Wi] order
// weight descriptor in [G, K, C, Z, Y, X] order /// weight descriptor in [G, K, C, Z, Y, X] order
// output descriptor in [G, N, K, Do, Ho, Wo] order /// output descriptor in [G, N, K, Do, Ho, Wo] order
// physical layout is irrelevant /// physical layout is irrelevant
template <ck::index_t NDimSpatial, template <ck::index_t NDimSpatial,
typename InDataType, typename InDataType,
typename WeiDataType, typename WeiDataType,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment