Unverified Commit 619d735d authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Replace \xxx with @XXX in structured comment. (#4822)



* param

* brief

* note

* return

* tparam

* brief2

* file

* return2

* return

* blabla

* all
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 96297fb8
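
For context on the change itself: the commit swaps backslash-prefixed Doxygen commands for the equivalent at-sign form; both spellings are valid Doxygen. A minimal before/after illustration (the function below is hypothetical, not taken from this diff):

```cuda
/*!                              /*!
 * \brief Adds two integers.      * @brief Adds two integers.
 * \param a First operand.        * @param a First operand.
 * \param b Second operand.       * @param b Second operand.
 * \return The sum a + b.         * @return The sum a + b.
 */                               */
int Add(int a, int b);           int Add(int a, int b);
```
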
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/gather_mm.cu
* \brief GatherMM C APIs and definitions.
* @file array/cuda/gather_mm.cu
* @brief GatherMM C APIs and definitions.
*/
#include <dgl/array.h>
#include <algorithm> // std::swap
......@@ -15,7 +15,7 @@ namespace aten {
namespace {
/*! \brief Call cuBLAS GEMM API for dense matmul operation for float and double. */
/*! @brief Call cuBLAS GEMM API for dense matmul operation for float and double. */
template <typename DType>
cublasStatus_t cublasGemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
......@@ -77,12 +77,13 @@ cublasStatus_t cublasGemm<double>(cublasHandle_t handle, cublasOperation_t trans
namespace cuda {
/* \Note Each row of A multiplies a segment of matrix of B of dimension in_len * outlen.
One warp is assigned to process one row of A. Each WARP sequentially multiplies
one element of A and a row of B to compute partial result of the output. A
is loaded in shared memory in a coalesced way. Output matrix is loaded in
registers. B should get benefit from L2 cache.
*/
/**
* @note Each row of A multiplies a segment of matrix of B of dimension in_len * outlen.
* One warp is assigned to process one row of A. Each WARP sequentially multiplies
* one element of A and a row of B to compute partial result of the output. A
* is loaded in shared memory in a coalesced way. Output matrix is loaded in
* registers. B should get benefit from L2 cache.
*/
template <typename Idx, typename DType>
__global__ void GatherMMScatterKernel(
const DType* __restrict__ A,
......@@ -138,12 +139,13 @@ __global__ void GatherMMScatterKernel(
}
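
As a quick, hedged illustration of the warp-per-row assignment the note above describes (names and the per-row copy workload are illustrative, not the actual GatherMMScatterKernel body):

```cuda
// One warp handles one row; lanes within the warp stride over that row's columns.
template <typename DType>
__global__ void WarpPerRowCopySketch(const DType* __restrict__ in,
                                     DType* __restrict__ out,
                                     int num_rows, int row_len) {
  constexpr int kWarpSize = 32;
  const int gtid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int warp_id = gtid / kWarpSize;   // one warp <-> one row
  const int lane    = gtid % kWarpSize;   // lanes cooperate across columns
  if (warp_id >= num_rows) return;
  for (int col = lane; col < row_len; col += kWarpSize)
    out[warp_id * row_len + col] = in[warp_id * row_len + col];
}
```
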
/* \Note Output matrix is accumulated via atomic operations. Rest of the strategies
are similar to GatherMMKernel. One warp is assigned to process one row of A. Each
WARP sequentially multiplies one element of A and a row of B to compute partial
result of the output. A is loaded in shared memory in a coalesced way. B should
get benefit from L2 cache.
*/
/**
* @note Output matrix is accumulated via atomic operations. Rest of the strategies
* are similar to GatherMMKernel. One warp is assigned to process one row of A. Each
* WARP sequentially multiplies one element of A and a row of B to compute partial
* result of the output. A is loaded in shared memory in a coalesced way. B should
* get benefit from L2 cache.
*/
template <typename Idx, typename DType>
__global__ void GatherMMScatterKernel2(
const DType* __restrict__ A,
......@@ -197,15 +199,15 @@ __global__ void GatherMMScatterKernel2(
} // namespace cuda
/*!
* \brief Implementation of Gather_mm operator. The input matrix A is
* @brief Implementation of Gather_mm operator. The input matrix A is
* expected to be sorted according to relation type.
* \param A The input dense matrix of dimension m x k
* \param B The input dense matrix of dimension k x n
* \param C The output dense matrix of dimension m x n
* \param seglen_A The input vector of size R. Each element
* @param A The input dense matrix of dimension m x k
* @param B The input dense matrix of dimension k x n
* @param C The output dense matrix of dimension m x n
* @param seglen_A The input vector of size R. Each element
* is the length of segments of input ``A``
* \param a_trans Matrix A to be transposed
* \param b_trans Matrix B to be transposed
* @param a_trans Matrix A to be transposed
* @param b_trans Matrix B to be transposed
*/
template <int XPU, typename IdType, typename DType>
void SegmentMM(const NDArray A,
......@@ -308,13 +310,13 @@ void SegmentMMBackwardB(const NDArray A,
}
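
To make the documented SegmentMM semantics concrete, here is a hedged host-side reference in plain C++; it assumes B is a stack of R k-by-n matrices, one per segment of A, which the surrounding comments suggest but the excerpt does not spell out:

```cuda
#include <vector>

// Reference semantics sketch (not the CUDA implementation):
// every row in segment r of A is multiplied by the r-th k x n matrix of B.
std::vector<std::vector<double>> SegmentMMReference(
    const std::vector<std::vector<double>>& A,               // m x k
    const std::vector<std::vector<std::vector<double>>>& B,  // R x k x n
    const std::vector<int>& seglen_A) {                      // R segment lengths
  const size_t n = B.empty() ? 0 : B[0][0].size();
  std::vector<std::vector<double>> C(A.size(), std::vector<double>(n, 0.0));
  size_t row = 0;
  for (size_t r = 0; r < seglen_A.size(); ++r) {
    for (int s = 0; s < seglen_A[r]; ++s, ++row) {
      for (size_t j = 0; j < n; ++j) {
        double acc = 0.0;
        for (size_t t = 0; t < A[row].size(); ++t)
          acc += A[row][t] * B[r][t][j];
        C[row][j] = acc;
      }
    }
  }
  return C;
}
```
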
/*!
* \brief Implementation of Gather_mm operator. The input matrix A is
* @brief Implementation of Gather_mm operator. The input matrix A is
* expected to be sorted according to relation type.
* \param A The input dense matrix of dimension m x k
* \param B The input dense matrix of dimension k x n
* \param C The output dense matrix of dimension m x n
* \param idx_a The input vector to gather left hand operand on
* \param idx_b The input vector to gather right hand operand on
* @param A The input dense matrix of dimension m x k
* @param B The input dense matrix of dimension k x n
* @param C The output dense matrix of dimension m x n
* @param idx_a The input vector to gather left hand operand on
* @param idx_b The input vector to gather right hand operand on
*/
template <int XPU, typename IdType, typename DType>
......@@ -345,17 +347,17 @@ void GatherMM(const NDArray A,
}
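
Likewise, a hedged reference for GatherMM as the parameters above describe it, assuming idx_a gathers rows of A and idx_b selects one k-by-n weight matrix from a stacked B (illustrative names, not the CUDA path):

```cuda
#include <vector>

// C[i, :] = A[idx_a[i], :] * B[idx_b[i]]  -- reference semantics sketch only.
std::vector<std::vector<double>> GatherMMReference(
    const std::vector<std::vector<double>>& A,               // rows of length k
    const std::vector<std::vector<std::vector<double>>>& B,  // num_rel x k x n
    const std::vector<int>& idx_a,
    const std::vector<int>& idx_b) {
  const size_t m = idx_a.size();
  const size_t n = B.empty() ? 0 : B[0][0].size();
  std::vector<std::vector<double>> C(m, std::vector<double>(n, 0.0));
  for (size_t i = 0; i < m; ++i) {
    const auto& a_row = A[idx_a[i]];   // gathered left-hand operand
    const auto& w     = B[idx_b[i]];   // gathered right-hand operand (one weight matrix)
    for (size_t j = 0; j < n; ++j)
      for (size_t t = 0; t < a_row.size(); ++t)
        C[i][j] += a_row[t] * w[t][j];
  }
  return C;
}
```
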
/*!
* \brief Implementation of Gather_mm operator. The input matrix A is
* @brief Implementation of Gather_mm operator. The input matrix A is
* expected to be sorted according to relation type.
* \param A The input dense matrix of dimension m x k
* \param B The input dense matrix of dimension k x n
* \param C The output dense matrix of dimension m x n
* \param idx_a The input vector to gather left hand operand on
* \param idx_b The input vector to gather right hand operand on
* \param idx_c The input vector to gather output operand on
* \param num_rel The number of idx types in idx_b
* \param a_trans Matrix A to be transposed
* \param b_trans Matrix B to be transposed
* @param A The input dense matrix of dimension m x k
* @param B The input dense matrix of dimension k x n
* @param C The output dense matrix of dimension m x n
* @param idx_a The input vector to gather left hand operand on
* @param idx_b The input vector to gather right hand operand on
* @param idx_c The input vector to gather output operand on
* @param num_rel The number of idx types in idx_b
* @param a_trans Matrix A to be transposed
* @param b_trans Matrix B to be transposed
*/
template <int XPU, typename IdType, typename DType>
void GatherMMScatter(const NDArray A,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/ge_spmm.cuh
* \brief GE-SpMM CUDA kernel function header.
* @file array/cuda/ge_spmm.cuh
* @brief GE-SpMM CUDA kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
......@@ -19,8 +19,8 @@ namespace aten {
namespace cuda {
/*!
* \brief CUDA kernel of GE-SpMM on Csr.
* \note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
* @brief CUDA kernel of GE-SpMM on Csr.
* @note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
* The grid dimension x and y are reordered for better performance.
*/
template <typename Idx, typename DType, typename BinaryOp>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/macro.cuh
* \brief Macro to call SPMM/SDDMM cuda kernels.
* @file array/cuda/macro.cuh
* @brief Macro to call SPMM/SDDMM cuda kernels.
*/
#ifndef DGL_ARRAY_CUDA_MACRO_CUH_
#define DGL_ARRAY_CUDA_MACRO_CUH_
......
/*!
* Copyright (c) 2021 by Contributors
* \file array/cuda/negative_sampling.cu
* \brief rowwise sampling
* @file array/cuda/negative_sampling.cu
* @brief rowwise sampling
*/
#include <curand_kernel.h>
......@@ -81,7 +81,7 @@ struct IsNotMinusOne {
};
/*!
* \brief Sort ordered pairs in ascending order, using \a tmp_major and \a
* @brief Sort ordered pairs in ascending order, using \a tmp_major and \a
* tmp_minor as temporary buffers, each with \a n elements.
*/
template <typename IdType>
......
/*!
* Copyright (c) 2021 by Contributors
* \file array/cuda/rowwise_sampling.cu
* \brief uniform rowwise sampling
* @file array/cuda/rowwise_sampling.cu
* @brief uniform rowwise sampling
*/
#include <curand_kernel.h>
......
/*!
* Copyright (c) 2022 by Contributors
* \file array/cuda/rowwise_sampling_prob.cu
* \brief weighted rowwise sampling. The degree computing kernels and
* @file array/cuda/rowwise_sampling_prob.cu
* @brief weighted rowwise sampling. The degree computing kernels and
* host-side functions are partially borrowed from the uniform rowwise
* sampling code rowwise_sampling.cu.
* \author pengqirong (OPPO), dlasalle and Xin from Nvidia.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
* @file array/cuda/sddmm.cu
* @brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
......@@ -11,7 +11,7 @@ namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on Csr format.
* @brief CUDA implementation of g-SDDMM on Csr format.
*/
template <int XPU, typename IdType, typename DType>
void SDDMMCsr(const std::string& op,
......@@ -31,7 +31,7 @@ void SDDMMCsr(const std::string& op,
/*!
* \brief CUDA implementation of g-SDDMM on Coo format.
* @brief CUDA implementation of g-SDDMM on Coo format.
*/
template <int XPU, typename IdType, typename DType>
void SDDMMCoo(const std::string& op,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cuh
* \brief SDDMM CUDA kernel function header.
* @file array/cuda/sddmm.cuh
* @brief SDDMM CUDA kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_SDDMM_CUH_
#define DGL_ARRAY_CUDA_SDDMM_CUH_
......@@ -87,8 +87,8 @@ namespace cuda {
constexpr unsigned int full_mask = 0xffffffff;
/*!
* \brief CUDA kernel of g-SDDMM on Coo format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SDDMM on Coo format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -136,8 +136,8 @@ __global__ void SDDMMCooKernel(
}
/*!
* \brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree reduction.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree reduction.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -204,8 +204,8 @@ __device__ __forceinline__ Idx BinarySearchSrc(const Idx *array, Idx length, Idx
}
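
BinarySearchSrc is only shown by its signature here; the usual pattern for an edge-parallel CSR kernel is to binary-search indptr for the row that owns a given edge id. A hedged sketch of that pattern, with an illustrative name (the real helper may differ in signature and edge cases):

```cuda
// Return the row r with indptr[r] <= eid < indptr[r + 1].
template <typename Idx>
__device__ __forceinline__ Idx BinarySearchRowOfEdge(
    const Idx* indptr, Idx num_rows, Idx eid) {
  Idx lo = 0, hi = num_rows - 1;
  while (lo < hi) {
    Idx mid = lo + (hi - lo) / 2;
    if (indptr[mid + 1] <= eid)
      lo = mid + 1;   // eid belongs to a later row
    else
      hi = mid;
  }
  return lo;
}
```
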
/*!
* \brief CUDA kernel of g-SDDMM on Csr format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SDDMM on Csr format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -255,12 +255,12 @@ __global__ void SDDMMCsrKernel(
}
/*!
* \brief CUDA implementation of g-SDDMM on Coo format.
* \param bcast Broadcast information.
* \param coo The Coo matrix.
* \param lhs The left hand side operand feature.
* \param rhs The right hand size operand feature.
* \param out The result feature on edges.
* @brief CUDA implementation of g-SDDMM on Coo format.
* @param bcast Broadcast information.
* @param coo The Coo matrix.
* @param lhs The left hand side operand feature.
* @param rhs The right hand side operand feature.
* @param out The result feature on edges.
*/
template <typename Idx, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
......@@ -324,12 +324,12 @@ void SDDMMCoo(
}
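
For the dot variant of SDDMM on COO, the documented lhs/rhs/out parameters amount to the following hedged reference loop (plain C++; broadcasting and the configurable lhs/rhs targets are omitted):

```cuda
#include <cstdint>
#include <vector>

// For edge e = (row[e], col[e]):  out[e] = <lhs[row[e]], rhs[col[e]]>.
void SDDMMDotReference(const std::vector<int64_t>& row,
                       const std::vector<int64_t>& col,
                       const std::vector<std::vector<float>>& lhs,
                       const std::vector<std::vector<float>>& rhs,
                       std::vector<float>& out) {
  out.assign(row.size(), 0.f);
  for (size_t e = 0; e < row.size(); ++e) {
    float acc = 0.f;
    for (size_t k = 0; k < lhs[row[e]].size(); ++k)
      acc += lhs[row[e]][k] * rhs[col[e]][k];
    out[e] = acc;
  }
}
```
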
/*!
* \brief CUDA implementation of g-SDDMM on Csr format.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param lhs The left hand side operand feature.
* \param rhs The right hand size operand feature.
* \param out The result feature on edges.
* @brief CUDA implementation of g-SDDMM on Csr format.
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param lhs The left hand side operand feature.
* @param rhs The right hand side operand feature.
* @param out The result feature on edges.
*/
template <typename Idx, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
* @file array/cuda/sddmm.cu
* @brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
......@@ -10,7 +10,7 @@ namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
* @brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, typename DType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
* @file array/cuda/sddmm.cu
* @brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
......@@ -10,7 +10,7 @@ namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
* @brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, typename DType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/segment_reduce.cu
* \brief Segment reduce C APIs and definitions.
* @file array/cuda/segment_reduce.cu
* @brief Segment reduce C APIs and definitions.
*/
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/segment_reduce.cuh
* \brief Segment reduce kernel function header.
* @file array/cuda/segment_reduce.cuh
* @brief Segment reduce kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
#define DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
......@@ -21,8 +21,8 @@ namespace aten {
namespace cuda {
/*!
* \brief CUDA kernel of segment reduce.
* \note each blockthread is responsible for aggregation on a row
* @brief CUDA kernel of segment reduce.
* @note each blockthread is responsible for aggregation on a row
* in the result tensor.
*/
template <typename IdType, typename DType, typename ReduceOp>
......@@ -45,8 +45,8 @@ __global__ void SegmentReduceKernel(
}
/*!
* \brief CUDA kernel of scatter add.
* \note each blockthread is responsible for adding a row in feature tensor
* @brief CUDA kernel of scatter add.
* @note each blockthread is responsible for adding a row in feature tensor
* to a target row in output tensor.
*/
template <typename IdType, typename DType>
......@@ -63,8 +63,8 @@ __global__ void ScatterAddKernel(
}
/*!
* \brief CUDA kernel to update gradients for reduce op max/min
* \note each WARP (group of 32 threads) is responsible for adding a row in
* @brief CUDA kernel to update gradients for reduce op max/min
* @note each WARP (group of 32 threads) is responsible for adding a row in
* feature tensor to a target row in output tensor.
*/
......@@ -91,8 +91,8 @@ __global__ void UpdateGradMinMaxHeteroKernel(
}
/*!
* \brief CUDA kernel of backward phase in segment min/max.
* \note each blockthread is responsible for writing a row in the
* @brief CUDA kernel of backward phase in segment min/max.
* @note each blockthread is responsible for writing a row in the
* result gradient tensor by lookup the ArgMin/Max for index information.
*/
template <typename IdType, typename DType>
......@@ -111,11 +111,11 @@ __global__ void BackwardSegmentCmpKernel(
}
/*!
* \brief CUDA implementation of forward phase of Segment Reduce.
* \param feat The input tensor.
* \param offsets The offsets tensor.
* \param out The output tensor.
* \param arg An auxiliary tensor storing ArgMax/Min information,
* @brief CUDA implementation of forward phase of Segment Reduce.
* @param feat The input tensor.
* @param offsets The offsets tensor.
* @param out The output tensor.
* @param arg An auxiliary tensor storing ArgMax/Min information,
*/
template <typename IdType, typename DType, typename ReduceOp>
void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
......@@ -142,11 +142,11 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
}
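
A hedged plain-C++ reference for the max-reduce case of SegmentReduce above, with segment i covering feat rows [offsets[i], offsets[i+1]) and arg recording the ArgMax rows (illustrative types, not the CUDA implementation):

```cuda
#include <cstdint>
#include <limits>
#include <vector>

void SegmentMaxReference(const std::vector<std::vector<float>>& feat,
                         const std::vector<int64_t>& offsets,
                         std::vector<std::vector<float>>& out,
                         std::vector<std::vector<int64_t>>& arg) {
  const size_t dim = feat.empty() ? 0 : feat[0].size();
  const size_t num_seg = offsets.empty() ? 0 : offsets.size() - 1;
  out.assign(num_seg, std::vector<float>(dim, -std::numeric_limits<float>::infinity()));
  arg.assign(num_seg, std::vector<int64_t>(dim, -1));
  for (size_t i = 0; i < num_seg; ++i)
    for (int64_t r = offsets[i]; r < offsets[i + 1]; ++r)
      for (size_t k = 0; k < dim; ++k)
        if (feat[r][k] > out[i][k]) {
          out[i][k] = feat[r][k];
          arg[i][k] = r;   // ArgMax bookkeeping, mirroring the `arg` tensor above
        }
}
```
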
/*!
* \brief CUDA implementation of Scatter Add (on first dimension).
* \note math equation: out[idx[i], *] += feat[i, *]
* \param feat The input tensor.
* \param idx The indices tensor.
* \param out The output tensor.
* @brief CUDA implementation of Scatter Add (on first dimension).
* @note math equation: out[idx[i], *] += feat[i, *]
* @param feat The input tensor.
* @param idx The indices tensor.
* @param out The output tensor.
*/
template <typename IdType, typename DType>
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
......@@ -171,13 +171,13 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
}
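
The equation in the ScatterAdd comment above, out[idx[i], *] += feat[i, *], corresponds to this plain-C++ reference loop (illustrative types only; `out` is assumed pre-sized and zero-initialised):

```cuda
#include <cstdint>
#include <vector>

void ScatterAddReference(const std::vector<std::vector<float>>& feat,
                         const std::vector<int64_t>& idx,
                         std::vector<std::vector<float>>& out) {
  // out[idx[i], *] += feat[i, *]
  for (size_t i = 0; i < feat.size(); ++i)
    for (size_t k = 0; k < feat[i].size(); ++k)
      out[idx[i]][k] += feat[i][k];
}
```
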
/*!
* \brief CUDA implementation to update gradients for reduce op max/min
* \param graph The input heterogeneous graph.
* \param op The binary operator, could be `copy_u`, `copy_e'.
* \param list_feat List of the input tensors.
* \param list_idx List of the indices tensors.
* \param list_idx_etype List of the node- or edge-type tensors.
* \param list_out List of the output tensors.
* @brief CUDA implementation to update gradients for reduce op max/min
* @param graph The input heterogeneous graph.
* @param op The binary operator, could be `copy_u`, `copy_e`.
* @param list_feat List of the input tensors.
* @param list_idx List of the indices tensors.
* @param list_idx_etype List of the node- or edge-type tensors.
* @param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(
......@@ -224,12 +224,12 @@ void UpdateGradMinMax_hetero(
}
/*!
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max
* @brief CUDA implementation of backward phase of Segment Reduce with Min/Max
* reducer.
* \note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
* @note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
* tensor.
* \param arg The ArgMin/Max information, used for indexing.
* \param out The output tensor.
* @param arg The ArgMin/Max information, used for indexing.
* @param out The output tensor.
*/
template <typename IdType, typename DType>
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
......
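
The backward min/max scatter stated above, out[arg[i, k], k] = feat[i, k], as a plain-C++ reference loop; skipping negative arg entries (e.g. for empty segments) is an assumption, and `out` is assumed pre-sized:

```cuda
#include <cstdint>
#include <vector>

void BackwardSegmentCmpReference(const std::vector<std::vector<float>>& feat,
                                 const std::vector<std::vector<int64_t>>& arg,
                                 std::vector<std::vector<float>>& out) {
  // out[arg[i, k], k] = feat[i, k]
  for (size_t i = 0; i < feat.size(); ++i)
    for (size_t k = 0; k < feat[i].size(); ++k)
      if (arg[i][k] >= 0)
        out[arg[i][k]][k] = feat[i][k];
}
```
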
/*!
* Copyright (c) 2021 by contributors.
* \file array/cuda/spmat_op_impl_coo.cu
* \brief COO operator GPU implementation
* @file array/cuda/spmat_op_impl_coo.cu
* @brief COO operator GPU implementation
*/
#include <dgl/array.h>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmat_op_impl_csr.cu
* \brief CSR operator CPU implementation
* @file array/cuda/spmat_op_impl_csr.cu
* @brief CSR operator CPU implementation
*/
#include <dgl/array.h>
......@@ -82,7 +82,7 @@ template NDArray CSRIsNonZero<kDGLCUDA, int64_t>(CSRMatrix, NDArray, NDArray);
///////////////////////////// CSRHasDuplicate /////////////////////////////
/*!
* \brief Check whether each row does not have any duplicate entries.
* @brief Check whether each row does not have any duplicate entries.
* Assume the CSR is sorted.
*/
template <typename IdType>
......@@ -228,7 +228,7 @@ template CSRMatrix CSRSliceRows<kDGLCUDA, int32_t>(CSRMatrix, int64_t, int64_t);
template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, int64_t, int64_t);
/*!
* \brief Copy data segment to output buffers
* @brief Copy data segment to output buffers
*
* For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
* to the out_data from out_indptr[i] ~ out_indptr[i+1]
......@@ -299,7 +299,7 @@ template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetDataAndIndices /////////////////////////////
/*!
* \brief Generate a 0-1 mask for each index that hits the provided (row, col)
* @brief Generate a 0-1 mask for each index that hits the provided (row, col)
* index.
*
* Examples:
......@@ -331,7 +331,7 @@ __global__ void _SegmentMaskKernel(
}
/*!
* \brief Search for the insertion positions for needle in the hay.
* @brief Search for the insertion positions for needle in the hay.
*
* The hay is a list of sorted elements and the result is the insertion position
* of each needle so that the insertion still gives sorted order.
......@@ -424,7 +424,7 @@ template std::vector<NDArray> CSRGetDataAndIndices<kDGLCUDA, int64_t>(
///////////////////////////// CSRSliceMatrix /////////////////////////////
/*!
* \brief Generate a 0-1 mask for each index whose column is in the provided
* @brief Generate a 0-1 mask for each index whose column is in the provided
* set. It also counts the number of masked values per row.
*/
template <typename IdType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmm.cu
* \brief SPMM C APIs and definitions.
* @file array/cuda/spmm.cu
* @brief SPMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./spmm.cuh"
......@@ -16,8 +16,8 @@ using namespace cuda;
namespace aten {
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \note use cusparse if the reduce operator is `sum` and there is
* @brief CUDA implementation of g-SpMM on Csr format.
* @note use cusparse if the reduce operator is `sum` and there is
* no broadcast, use dgl's kernel in other cases.
*/
template <int XPU, typename IdType, typename DType>
......@@ -81,7 +81,7 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
/*!
* \brief CUDA implementation of g-SpMM on Coo format.
* @brief CUDA implementation of g-SpMM on Coo format.
*/
template <int XPU, typename IdType, typename DType>
void SpMMCoo(const std::string& op, const std::string& reduce,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmm.cuh
* \brief SPMM CUDA kernel function header.
* @file array/cuda/spmm.cuh
* @brief SPMM CUDA kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_SPMM_CUH_
#define DGL_ARRAY_CUDA_SPMM_CUH_
......@@ -22,7 +22,7 @@ using namespace cuda;
namespace aten {
/*!
* \brief Determine whether cusparse SpMM function is applicable.
* @brief Determine whether cusparse SpMM function is applicable.
*/
template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
......@@ -41,7 +41,7 @@ inline bool cusparse_available(bool more_nnz_than_matrix_size) {
namespace {
/*! \brief Call cuBLAS geam API for transpose operation for float and double. */
/*! @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType>
cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
......@@ -98,8 +98,8 @@ cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
beta, B, ldb, C, ldc);
}
/* \brief IndexSelect operator kernel implementation.
* \note duplicate of IndexSelectKernel defined in array_index_select.cu
/* @brief IndexSelect operator kernel implementation.
* @note duplicate of IndexSelectKernel defined in array_index_select.cu
*/
template <typename DType, typename IdType>
__global__ void _IndexSelectKernel(
......@@ -112,8 +112,8 @@ __global__ void _IndexSelectKernel(
out[i * m + j] = in[idx[i] * m + j];
}
/* \brief Transpose operator kernel implementation.
* \note not efficient but it's not a bottleneck, used for float16 dtype.
/* @brief Transpose operator kernel implementation.
* @note not efficient but it's not a bottleneck, used for float16 dtype.
*/
template <typename DType>
__global__ void _TransposeKernel(
......@@ -126,9 +126,9 @@ __global__ void _TransposeKernel(
}
/*
* \brief Tranpose the input matrix.
* \param row number of rows of input matrix.
* \param col number of columns of input matrix.
* @brief Transpose the input matrix.
* @param row number of rows of input matrix.
* @param col number of columns of input matrix.
*/
template <typename DType>
void _Transpose(const DType* in, DType* out,
......@@ -150,8 +150,8 @@ void _Transpose(const DType* in, DType* out,
}
/*
* \brief Tranpose the input matrix for data type half.
* \note cuBLAS has no geam API for half data type, fallback to our kernel.
* @brief Transpose the input matrix for data type half.
* @note cuBLAS has no geam API for half data type, fallback to our kernel.
*/
template <>
void _Transpose<half>(const half* in, half* out,
......@@ -164,8 +164,8 @@ void _Transpose<half>(const half* in, half* out,
#if BF16_ENABLED
/*
* \brief Tranpose the input matrix for data type half.
* \note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
* @brief Transpose the input matrix for data type bf16.
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/
template <>
void _Transpose<__nv_bfloat16>(const __nv_bfloat16* in, __nv_bfloat16* out,
......@@ -178,7 +178,7 @@ void _Transpose<__nv_bfloat16>(const __nv_bfloat16* in, __nv_bfloat16* out,
#endif // BF16_ENABLED
/*
* \brief
* @brief
*/
template <typename DType, typename IdType>
__global__ void _IndexSelectKernel(const DType* array, const IdType* index,
......@@ -191,8 +191,8 @@ __global__ void _IndexSelectKernel(const DType* array, const IdType* index,
}
}
/* \brief IndexSelect operator.
* \note duplicate of IndexSelect defined in array_op.h but it can
/* @brief IndexSelect operator.
* @note duplicate of IndexSelect defined in array_op.h but it can
* not be applied to float16 dtype.
*/
template<typename DType, typename IdType>
......@@ -477,8 +477,8 @@ namespace cuda {
/*!
* \brief CUDA kernel of g-SpMM on Coo format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SpMM on Coo format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -526,8 +526,8 @@ __global__ void SpMMCooKernel(
}
/*!
* \brief CUDA kernel to compute argu and arge in g-SpMM on Coo format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel to compute argu and arge in g-SpMM on Coo format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -574,8 +574,8 @@ __global__ void ArgSpMMCooKernel(
}
/*!
* \brief CUDA kernel of g-SpMM on Csr format.
* \note it uses node parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SpMM on Csr format.
* @note it uses node parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different destination nodes.
* Threadblocks on the x-axis are responsible for the computation on
* different positions in feature dimension.
......@@ -632,8 +632,8 @@ __global__ void SpMMCsrKernel(
}
/*!
* \brief CUDA kernel of SpMM-Min/Max on Csr format.
* \note it uses node parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of SpMM-Min/Max on Csr format.
* @note it uses node parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different destination nodes.
* Threadblocks on the x-axis are responsible for the computation on
* different positions in feature dimension.
......@@ -693,16 +693,16 @@ __global__ void SpMMCmpCsrHeteroKernel(
}
/*!
* \brief CUDA implementation of g-SpMM on Coo format.
* \param bcast Broadcast information.
* \param coo The Coo matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* @brief CUDA implementation of g-SpMM on Coo format.
* @param bcast Broadcast information.
* @param coo The Coo matrix.
* @param ufeat The feature on source nodes.
* @param efeat The feature on edges.
* @param out The result feature on destination nodes.
* @param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* @param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
*/
......@@ -770,16 +770,16 @@ void SpMMCoo(
}
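
A hedged reference for g-SpMM on COO with op = mul and reduce = sum, matching the ufeat/efeat/out parameters documented above (plain C++; the argu/arge bookkeeping for min/max reducers is omitted, and `out` is assumed pre-sized to num_dst_nodes x feature_dim and zero-initialised):

```cuda
#include <cstdint>
#include <vector>

// For every edge e = (src[e], dst[e]):
//   out[dst[e], :] += ufeat[src[e], :] * efeat[e, :]
void SpMMSumMulReference(const std::vector<int64_t>& src,
                         const std::vector<int64_t>& dst,
                         const std::vector<std::vector<float>>& ufeat,
                         const std::vector<std::vector<float>>& efeat,
                         std::vector<std::vector<float>>& out) {
  for (size_t e = 0; e < src.size(); ++e)
    for (size_t k = 0; k < out[dst[e]].size(); ++k)
      out[dst[e]][k] += ufeat[src[e]][k] * efeat[e][k];
}
```
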
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* @brief CUDA implementation of g-SpMM on Csr format.
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param ufeat The feature on source nodes.
* @param efeat The feature on edges.
* @param out The result feature on destination nodes.
* @param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* @param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
*/
......@@ -825,26 +825,26 @@ void SpMMCsr(
}
/*!
* \brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* @brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph.
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param ufeat The feature on source nodes.
* @param efeat The feature on edges.
* @param out The result feature on destination nodes.
* @param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* @param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param argu_ntype Node type of the arg-Min/Max on source nodes, which refers the
* @param argu_ntype Node type of the arg-Min/Max on source nodes, which refers the
* source node types correspond to the minimum/maximum values of reduction result
* on destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge_etype Edge-type of the arg-Min/Max on edges. which refers the source
* @param arge_etype Edge-type of the arg-Min/Max on edges. which refers the source
* node indices correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param src_type Node type of the source nodes of an etype
* \param etype Edge type
* @param src_type Node type of the source nodes of an etype
* @param etype Edge type
*/
template <typename Idx, typename DType,
typename BinaryOp, typename ReduceOp>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmm.cu
* \brief SPMM C APIs and definitions.
* @file array/cuda/spmm.cu
* @brief SPMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./spmm.cuh"
......@@ -16,8 +16,8 @@ using namespace cuda;
namespace aten {
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \note use cusparse if the reduce operator is `sum` and there is
* @brief CUDA implementation of g-SpMM on Csr format.
* @note use cusparse if the reduce operator is `sum` and there is
* no broadcast, use dgl's kernel in other cases.
*/
template <int XPU, typename IdType, typename DType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.cu
* \brief Utilities for CUDA kernels.
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
#include "./utils.h"
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.h
* \brief Utilities for CUDA kernels.
* @file array/cuda/utils.h
* @brief Utilities for CUDA kernels.
*/
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_
......@@ -23,7 +23,7 @@ namespace cuda {
#define CUDA_MAX_NUM_THREADS 256
/*! \brief Calculate the number of threads needed given the dimension length.
/*! @brief Calculate the number of threads needed given the dimension length.
*
* It finds the biggest number that is smaller than min(dim, max_nthrs)
* and is also power of two.
......@@ -78,20 +78,20 @@ __device__ __forceinline__ T _ldg(T* addr) {
}
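
The thread-count helper described above (largest power of two not exceeding min(dim, CUDA_MAX_NUM_THREADS)) can be sketched as follows; the name and the exact boundary behaviour are assumptions, not the real utils.h code:

```cuda
inline int LargestPow2Threads(int dim, int max_nthrs = 256 /* CUDA_MAX_NUM_THREADS */) {
  int ret = max_nthrs;
  while (ret > dim && ret > 1)
    ret >>= 1;   // halve until it fits under dim (never below 1)
  return ret;
}
```
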
/*!
* \brief Return true if the given bool flag array is all true.
* @brief Return true if the given bool flag array is all true.
* The input bool array is in int8_t type so it is aligned with byte address.
*
* \param flags The bool array.
* \param length The length.
* \param ctx Device context.
* \return True if all the flags are true.
* @param flags The bool array.
* @param length The length.
* @param ctx Device context.
* @return True if all the flags are true.
*/
bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx);
/*!
* \brief CUDA Kernel of filling the vector started from ptr of size length
* @brief CUDA Kernel of filling the vector started from ptr of size length
* with val.
* \note internal use only.
* @note internal use only.
*/
template <typename DType>
__global__ void _FillKernel(DType* ptr, size_t length, DType val) {
......@@ -103,7 +103,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
}
}
/*! \brief Fill the vector started from ptr of size length with val */
/*! @brief Fill the vector started from ptr of size length with val */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
......@@ -113,7 +113,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
}
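
A hedged sketch of the fill pattern above: a grid-stride kernel plus a plain launch on the current stream. Names and the block-sizing choice are illustrative; the real _Fill may differ:

```cuda
#include <cuda_runtime.h>

template <typename DType>
__global__ void FillKernelSketch(DType* ptr, size_t length, DType val) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = gridDim.x * blockDim.x;
  for (; i < length; i += stride)
    ptr[i] = val;   // each thread fills a strided subset of the vector
}

template <typename DType>
void FillSketch(DType* ptr, size_t length, DType val, cudaStream_t stream) {
  if (length == 0) return;
  const int nt = 256;   // e.g. CUDA_MAX_NUM_THREADS
  const int nb = static_cast<int>((length + nt - 1) / nt);
  FillKernelSketch<<<nb, nt, 0, stream>>>(ptr, length, val);
}
```
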
/*!
* \brief Search adjacency list linearly for each (row, col) pair and
* @brief Search adjacency list linearly for each (row, col) pair and
* write the data under the matched position in the indices array to the output.
*
* If there is no match, the value in \c filler is written.
......@@ -157,7 +157,7 @@ __global__ void _LinearSearchKernel(
#if BF16_ENABLED
/*!
* \brief Specialization for bf16 because conversion from long long to bfloat16
* @brief Specialization for bf16 because conversion from long long to bfloat16
* doesn't exist before SM80.
*/
template <typename IdType>
......@@ -206,7 +206,7 @@ inline DType GetCUDAScalar(
}
/*!
* \brief Given a sorted array and a value this function returns the index
* @brief Given a sorted array and a value this function returns the index
* of the first element which compares greater than value.
*
* This function assumes 0-based index
......@@ -231,7 +231,7 @@ __device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) {
}
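
Given the description and the signature shown for _UpperBound, a standard upper-bound binary search looks like this (sketch with an illustrative name):

```cuda
#include <cstdint>

// First index i in [0, n] with A[i] > x; returns n if no element is greater.
template <typename IdType>
__device__ IdType UpperBoundSketch(const IdType* A, int64_t n, IdType x) {
  int64_t lo = 0, hi = n;
  while (lo < hi) {
    const int64_t mid = lo + (hi - lo) / 2;
    if (A[mid] > x)
      hi = mid;
    else
      lo = mid + 1;
  }
  return static_cast<IdType>(lo);
}
```
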
/*!
* \brief Given a sorted array and a value this function returns the index
* @brief Given a sorted array and a value this function returns the index
* of the element who is equal to val. If not exist returns n+1
*
* This function assumes 0-based index
......
/*!
* Copyright (c) 2019-2022 by Contributors
* \file array/cuda/uvm/array_index_select_uvm.cu
* \brief Array index select GPU implementation
* @file array/cuda/uvm/array_index_select_uvm.cu
* @brief Array index select GPU implementation
*/
#include <dgl/array.h>
......