Unverified commit bcd37684, authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Replace /*! with /**. (#4823)



* replace

* blabla

* blabla

* blabla
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 619d735d
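For context, both `/*!` (Qt style) and `/**` (JavaDoc style) open a Doxygen documentation block; this commit simply standardizes the codebase on the JavaDoc form. Below is a minimal sketch of the adopted style, using a hypothetical function that does not appear in the diff:

```cpp
// Old opener (Qt style), being phased out by this commit:  /*! ... */
// New opener (JavaDoc style), adopted throughout:          /** ... */

/**
 * @brief Adds two integers (hypothetical example, for illustration only).
 * @param a First operand.
 * @param b Second operand.
 * @return The sum of a and b.
 */
int Add(int a, int b) { return a + b; }
```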
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/macro.cuh
 * @brief Macro to call SPMM/SDDMM cuda kernels.
......
-/*!
+/**
 * Copyright (c) 2021 by Contributors
 * @file array/cuda/negative_sampling.cu
 * @brief rowwise sampling
@@ -80,7 +80,7 @@ struct IsNotMinusOne {
 }
 };
-/*!
+/**
 * @brief Sort ordered pairs in ascending order, using \a tmp_major and \a
 * tmp_minor as temporary buffers, each with \a n elements.
 */
......
-/*!
+/**
 * Copyright (c) 2021 by Contributors
 * @file array/cuda/rowwise_sampling.cu
 * @brief uniform rowwise sampling
......
-/*!
+/**
 * Copyright (c) 2022 by Contributors
 * @file array/cuda/rowwise_sampling_prob.cu
 * @brief weighted rowwise sampling. The degree computing kernels and
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cu
 * @brief SDDMM C APIs and definitions.
@@ -10,7 +10,7 @@
 namespace dgl {
 namespace aten {
-/*!
+/**
 * @brief CUDA implementation of g-SDDMM on Csr format.
 */
 template <int XPU, typename IdType, typename DType>
@@ -30,7 +30,7 @@ void SDDMMCsr(const std::string& op,
 }
-/*!
+/**
 * @brief CUDA implementation of g-SDDMM on Coo format.
 */
 template <int XPU, typename IdType, typename DType>
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cuh
 * @brief SDDMM CUDA kernel function header.
@@ -86,7 +86,7 @@ namespace cuda {
 constexpr unsigned int full_mask = 0xffffffff;
-/*!
+/**
 * @brief CUDA kernel of g-SDDMM on Coo format.
 * @note it uses edge parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different edges. Threadblocks
@@ -135,7 +135,7 @@ __global__ void SDDMMCooKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree reduction.
 * @note it uses edge parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different edges. Threadblocks
@@ -203,7 +203,7 @@ __device__ __forceinline__ Idx BinarySearchSrc(const Idx *array, Idx length, Idx
 }
 }
-/*!
+/**
 * @brief CUDA kernel of g-SDDMM on Csr format.
 * @note it uses edge parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different edges. Threadblocks
@@ -254,7 +254,7 @@ __global__ void SDDMMCsrKernel(
 }
 }
-/*!
+/**
 * @brief CUDA implementation of g-SDDMM on Coo format.
 * @param bcast Broadcast information.
 * @param coo The Coo matrix.
@@ -323,7 +323,7 @@ void SDDMMCoo(
 }
 }
-/*!
+/**
 * @brief CUDA implementation of g-SDDMM on Csr format.
 * @param bcast Broadcast information.
 * @param csr The Csr matrix.
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cu
 * @brief SDDMM C APIs and definitions.
@@ -9,7 +9,7 @@
 namespace dgl {
 namespace aten {
-/*!
+/**
 * @brief CUDA implementation of g-SDDMM on heterograph using
 Csr format.
 */
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/sddmm.cu
 * @brief SDDMM C APIs and definitions.
@@ -9,7 +9,7 @@
 namespace dgl {
 namespace aten {
-/*!
+/**
 * @brief CUDA implementation of g-SDDMM on heterograph using
 Csr format.
 */
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/segment_reduce.cu
 * @brief Segment reduce C APIs and definitions.
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/segment_reduce.cuh
 * @brief Segment reduce kernel function header.
@@ -20,7 +20,7 @@ using namespace cuda;
 namespace aten {
 namespace cuda {
-/*!
+/**
 * @brief CUDA kernel of segment reduce.
 * @note each blockthread is responsible for aggregation on a row
 * in the result tensor.
@@ -44,7 +44,7 @@ __global__ void SegmentReduceKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel of scatter add.
 * @note each blockthread is responsible for adding a row in feature tensor
 * to a target row in output tensor.
@@ -62,7 +62,7 @@ __global__ void ScatterAddKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel to update gradients for reduce op max/min
 * @note each WARP (group of 32 threads) is responsible for adding a row in
 * feature tensor to a target row in output tensor.
@@ -90,7 +90,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel of backward phase in segment min/max.
 * @note each blockthread is responsible for writing a row in the
 * result gradient tensor by lookup the ArgMin/Max for index information.
@@ -110,7 +110,7 @@ __global__ void BackwardSegmentCmpKernel(
 }
 }
-/*!
+/**
 * @brief CUDA implementation of forward phase of Segment Reduce.
 * @param feat The input tensor.
 * @param offsets The offsets tensor.
@@ -141,7 +141,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
 feat_data, offsets_data, out_data, arg_data, n, dim);
 }
-/*!
+/**
 * @brief CUDA implementation of Scatter Add (on first dimension).
 * @note math equation: out[idx[i], *] += feat[i, *]
 * @param feat The input tensor.
@@ -170,7 +170,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
 idx_data, out_data, n, dim);
 }
-/*!
+/**
 * @brief CUDA implementation to update gradients for reduce op max/min
 * @param graph The input heterogeneous graph.
 * @param op The binary operator, could be `copy_u`, `copy_e'.
@@ -223,7 +223,7 @@ void UpdateGradMinMax_hetero(
 }
 }
-/*!
+/**
 * @brief CUDA implementation of backward phase of Segment Reduce with Min/Max
 * reducer.
 * @note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
......
-/*!
+/**
 * Copyright (c) 2021 by contributors.
 * @file array/cuda/spmat_op_impl_coo.cu
 * @brief COO operator GPU implementation
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/spmat_op_impl_csr.cu
 * @brief CSR operator CPU implementation
@@ -81,7 +81,7 @@ template NDArray CSRIsNonZero<kDGLCUDA, int64_t>(CSRMatrix, NDArray, NDArray);
 ///////////////////////////// CSRHasDuplicate /////////////////////////////
-/*!
+/**
 * @brief Check whether each row does not have any duplicate entries.
 * Assume the CSR is sorted.
 */
@@ -227,7 +227,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
 template CSRMatrix CSRSliceRows<kDGLCUDA, int32_t>(CSRMatrix, int64_t, int64_t);
 template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, int64_t, int64_t);
-/*!
+/**
 * @brief Copy data segment to output buffers
 *
 * For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
@@ -298,7 +298,7 @@ template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, NDArray);
 ///////////////////////////// CSRGetDataAndIndices /////////////////////////////
-/*!
+/**
 * @brief Generate a 0-1 mask for each index that hits the provided (row, col)
 * index.
 *
@@ -330,7 +330,7 @@ __global__ void _SegmentMaskKernel(
 }
 }
-/*!
+/**
 * @brief Search for the insertion positions for needle in the hay.
 *
 * The hay is a list of sorted elements and the result is the insertion position
@@ -423,7 +423,7 @@ template std::vector<NDArray> CSRGetDataAndIndices<kDGLCUDA, int64_t>(
 ///////////////////////////// CSRSliceMatrix /////////////////////////////
-/*!
+/**
 * @brief Generate a 0-1 mask for each index whose column is in the provided
 * set. It also counts the number of masked values per row.
 */
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cu
 * @brief SPMM C APIs and definitions.
@@ -15,7 +15,7 @@ using namespace cuda;
 namespace aten {
-/*!
+/**
 * @brief CUDA implementation of g-SpMM on Csr format.
 * @note use cusparse if the reduce operator is `sum` and there is
 * no broadcast, use dgl's kernel in other cases.
@@ -80,7 +80,7 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
 }
-/*!
+/**
 * @brief CUDA implementation of g-SpMM on Coo format.
 */
 template <int XPU, typename IdType, typename DType>
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cuh
 * @brief SPMM CUDA kernel function header.
@@ -21,7 +21,7 @@ using namespace cuda;
 namespace aten {
-/*!
+/**
 * @brief Determine whether cusparse SpMM function is applicable.
 */
 template <typename DType, typename IdType>
@@ -41,7 +41,7 @@ inline bool cusparse_available(bool more_nnz_than_matrix_size) {
 namespace {
-/*! @brief Call cuBLAS geam API for transpose operation for float and double. */
+/** @brief Call cuBLAS geam API for transpose operation for float and double. */
 template <typename DType>
 cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
 cublasOperation_t transb, int m, int n,
@@ -98,7 +98,8 @@ cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
 beta, B, ldb, C, ldc);
 }
-/* @brief IndexSelect operator kernel implementation.
+/**
+ * @brief IndexSelect operator kernel implementation.
 * @note duplicate of IndexSelectKernel defined in array_index_select.cu
 */
 template <typename DType, typename IdType>
@@ -112,7 +113,8 @@ __global__ void _IndexSelectKernel(
 out[i * m + j] = in[idx[i] * m + j];
 }
-/* @brief Transpose operator kernel implementation.
+/**
+ * @brief Transpose operator kernel implementation.
 * @note not efficient but it's not a bottleneck, used for float16 dtype.
 */
 template <typename DType>
@@ -125,7 +127,7 @@ __global__ void _TransposeKernel(
 out[i * m + j] = in[j * n + i];
 }
-/*
+/**
 * @brief Tranpose the input matrix.
 * @param row number of rows of input matrix.
 * @param col number of columns of input matrix.
@@ -149,7 +151,7 @@ void _Transpose(const DType* in, DType* out,
 out, row));
 }
-/*
+/**
 * @brief Tranpose the input matrix for data type half.
 * @note cuBLAS has no geam API for half data type, fallback to our kernel.
 */
@@ -163,7 +165,7 @@ void _Transpose<half>(const half* in, half* out,
 }
 #if BF16_ENABLED
-/*
+/**
 * @brief Tranpose the input matrix for data type half.
 * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
 */
@@ -177,7 +179,7 @@ void _Transpose<__nv_bfloat16>(const __nv_bfloat16* in, __nv_bfloat16* out,
 }
 #endif // BF16_ENABLED
-/*
+/**
 * @brief
 */
 template <typename DType, typename IdType>
@@ -247,7 +249,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
 }
 #endif
-/*! Cusparse implementation of SpMM on Csr format. */
+/** Cusparse implementation of SpMM on Csr format. */
 template <typename DType, typename IdType>
 void CusparseCsrmm2(
 const DGLContext& ctx,
@@ -347,7 +349,7 @@ void CusparseCsrmm2(
 device->FreeWorkspace(ctx, valptr);
 }
-/*! Cusparse implementation of SpMM on Csr format. */
+/** Cusparse implementation of SpMM on Csr format. */
 template <typename DType, typename IdType>
 void CusparseCsrmm2Hetero(
 const DGLContext& ctx,
@@ -476,7 +478,7 @@ void CusparseCsrmm2Hetero(
 namespace cuda {
-/*!
+/**
 * @brief CUDA kernel of g-SpMM on Coo format.
 * @note it uses edge parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different edges. Threadblocks
@@ -525,7 +527,7 @@ __global__ void SpMMCooKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel to compute argu and arge in g-SpMM on Coo format.
 * @note it uses edge parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different edges. Threadblocks
@@ -573,7 +575,7 @@ __global__ void ArgSpMMCooKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel of g-SpMM on Csr format.
 * @note it uses node parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different destination nodes.
@@ -631,7 +633,7 @@ __global__ void SpMMCsrKernel(
 }
 }
-/*!
+/**
 * @brief CUDA kernel of SpMM-Min/Max on Csr format.
 * @note it uses node parallel strategy, different threadblocks (on y-axis)
 * is responsible for the computation on different destination nodes.
@@ -692,7 +694,7 @@ __global__ void SpMMCmpCsrHeteroKernel(
 }
 }
-/*!
+/**
 * @brief CUDA implementation of g-SpMM on Coo format.
 * @param bcast Broadcast information.
 * @param coo The Coo matrix.
@@ -769,7 +771,7 @@ void SpMMCoo(
 });
 }
-/*!
+/**
 * @brief CUDA implementation of g-SpMM on Csr format.
 * @param bcast Broadcast information.
 * @param csr The Csr matrix.
@@ -824,7 +826,7 @@ void SpMMCsr(
 });
 }
-/*!
+/**
 * @brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph.
 * @param bcast Broadcast information.
 * @param csr The Csr matrix.
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/spmm.cu
 * @brief SPMM C APIs and definitions.
@@ -15,7 +15,7 @@ using namespace cuda;
 namespace aten {
-/*!
+/**
 * @brief CUDA implementation of g-SpMM on Csr format.
 * @note use cusparse if the reduce operator is `sum` and there is
 * no broadcast, use dgl's kernel in other cases.
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/utils.cu
 * @brief Utilities for CUDA kernels.
......
-/*!
+/**
 * Copyright (c) 2020 by Contributors
 * @file array/cuda/utils.h
 * @brief Utilities for CUDA kernels.
@@ -23,7 +23,7 @@ namespace cuda {
 #define CUDA_MAX_NUM_THREADS 256
-/*! @brief Calculate the number of threads needed given the dimension length.
+/** @brief Calculate the number of threads needed given the dimension length.
 *
 * It finds the biggest number that is smaller than min(dim, max_nthrs)
 * and is also power of two.
@@ -39,7 +39,7 @@ inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) {
 return ret;
 }
-/*
+/**
 * !\brief Find number of blocks is smaller than nblks and max_nblks
 * on the given axis ('x', 'y' or 'z').
 */
@@ -77,7 +77,7 @@ __device__ __forceinline__ T _ldg(T* addr) {
 #endif
 }
-/*!
+/**
 * @brief Return true if the given bool flag array is all true.
 * The input bool array is in int8_t type so it is aligned with byte address.
 *
@@ -88,7 +88,7 @@ __device__ __forceinline__ T _ldg(T* addr) {
 */
 bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx);
-/*!
+/**
 * @brief CUDA Kernel of filling the vector started from ptr of size length
 * with val.
 * @note internal use only.
@@ -103,7 +103,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
 }
 }
-/*! @brief Fill the vector started from ptr of size length with val */
+/** @brief Fill the vector started from ptr of size length with val */
 template <typename DType>
 void _Fill(DType* ptr, size_t length, DType val) {
 cudaStream_t stream = runtime::getCurrentCUDAStream();
@@ -112,7 +112,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
 CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, stream, ptr, length, val);
 }
-/*!
+/**
 * @brief Search adjacency list linearly for each (row, col) pair and
 * write the data under the matched position in the indices array to the output.
 *
@@ -156,7 +156,7 @@ __global__ void _LinearSearchKernel(
 }
 #if BF16_ENABLED
-/*!
+/**
 * @brief Specialization for bf16 because conversion from long long to bfloat16
 * doesn't exist before SM80.
 */
@@ -205,7 +205,7 @@ inline DType GetCUDAScalar(
 return result;
 }
-/*!
+/**
 * @brief Given a sorted array and a value this function returns the index
 * of the first element which compares greater than value.
 *
@@ -230,7 +230,7 @@ __device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) {
 return l;
 }
-/*!
+/**
 * @brief Given a sorted array and a value this function returns the index
 * of the element who is equal to val. If not exist returns n+1
 *
......
-/*!
+/**
 * Copyright (c) 2019-2022 by Contributors
 * @file array/cuda/uvm/array_index_select_uvm.cu
 * @brief Array index select GPU implementation
......
-/*!
+/**
 * Copyright (c) 2021 by Contributors
 * @file array/cpu/array_index_select_uvm.cuh
 * @brief Array index select GPU kernel implementation
@@ -13,9 +13,10 @@ namespace dgl {
 namespace aten {
 namespace impl {
-/* This is a cross-device access version of IndexSelectMultiKernel.
- * Since the memory access over PCIe is more sensitive to the
- * data access aligment (cacheline), we need a separate version here.
+/**
+ * This is a cross-device access version of IndexSelectMultiKernel.
+ * Since the memory access over PCIe is more sensitive to the
+ * data access aligment (cacheline), we need a separate version here.
 */
 template <typename DType, typename IdType>
 __global__ void IndexSelectMultiKernelAligned(
......
-/*!
+/**
 * Copyright (c) 2021 by Contributors
 * @file array/filter.cc
 * @brief Object for selecting items in a set, or selecting items not in a set.
......