Unverified Commit 619d735d authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Replace \xxx with @XXX in structured comment. (#4822)



* param

* brief

* note

* return

* tparam

* brief2

* file

* return2

* return

* blabla

* all
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 96297fb8
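
For context on the change itself: the commit swaps backslash-prefixed Doxygen commands for the equivalent at-sign form; both spellings are valid Doxygen. A minimal before/after illustration (the function below is hypothetical, not taken from this diff):

```cuda
/*!                              /*!
 * \brief Adds two integers.      * @brief Adds two integers.
 * \param a First operand.        * @param a First operand.
 * \param b Second operand.       * @param b Second operand.
 * \return The sum a + b.         * @return The sum a + b.
 */                               */
int Add(int a, int b);           int Add(int a, int b);
```
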
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/gather_mm.cu
* \brief GatherMM C APIs and definitions.
* @file array/cuda/gather_mm.cu
* @brief GatherMM C APIs and definitions.
*/
#include <dgl/array.h>
#include <algorithm> // std::swap
......@@ -15,7 +15,7 @@ namespace aten {
namespace {
/*! \brief Call cuBLAS GEMM API for dense matmul operation for float and double. */
/*! @brief Call cuBLAS GEMM API for dense matmul operation for float and double. */
template <typename DType>
cublasStatus_t cublasGemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
......@@ -77,12 +77,13 @@ cublasStatus_t cublasGemm<double>(cublasHandle_t handle, cublasOperation_t trans
namespace cuda {
/* \Note Each row of A multiplies a segment of matrix of B of dimension in_len * outlen.
One warp is assigned to process one row of A. Each WARP sequentially multiplies
one element of A and a row of B to compute partial result of the output. A
is loaded in shared memory in a coalesced way. Output matrix is loaded in
registers. B should get benefit from L2 cache.
*/
/**
* @note Each row of A multiplies a segment of matrix of B of dimension in_len * outlen.
* One warp is assigned to process one row of A. Each WARP sequentially multiplies
* one element of A and a row of B to compute partial result of the output. A
* is loaded in shared memory in a coalesced way. Output matrix is loaded in
* registers. B should get benefit from L2 cache.
*/
template <typename Idx, typename DType>
__global__ void GatherMMScatterKernel(
const DType* __restrict__ A,
......@@ -138,12 +139,13 @@ __global__ void GatherMMScatterKernel(
}
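
As a quick, hedged illustration of the warp-per-row assignment the note above describes (names and the per-row copy workload are illustrative, not the actual GatherMMScatterKernel body):

```cuda
// One warp handles one row; lanes within the warp stride over that row's columns.
template <typename DType>
__global__ void WarpPerRowCopySketch(const DType* __restrict__ in,
                                     DType* __restrict__ out,
                                     int num_rows, int row_len) {
  constexpr int kWarpSize = 32;
  const int gtid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int warp_id = gtid / kWarpSize;   // one warp <-> one row
  const int lane    = gtid % kWarpSize;   // lanes cooperate across columns
  if (warp_id >= num_rows) return;
  for (int col = lane; col < row_len; col += kWarpSize)
    out[warp_id * row_len + col] = in[warp_id * row_len + col];
}
```
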
/* \Note Output matrix is accumulated via atomic operations. Rest of the strategies
are similar to GatherMMKernel. One warp is assigned to process one row of A. Each
WARP sequentially multiplies one element of A and a row of B to compute partial
result of the output. A is loaded in shared memory in a coalesced way. B should
get benefit from L2 cache.
*/
/**
* @note Output matrix is accumulated via atomic operations. Rest of the strategies
* are similar to GatherMMKernel. One warp is assigned to process one row of A. Each
* WARP sequentially multiplies one element of A and a row of B to compute partial
* result of the output. A is loaded in shared memory in a coalesced way. B should
* get benefit from L2 cache.
*/
template <typename Idx, typename DType>
__global__ void GatherMMScatterKernel2(
const DType* __restrict__ A,
......@@ -197,15 +199,15 @@ __global__ void GatherMMScatterKernel2(
} // namespace cuda
/*!
* \brief Implementation of Gather_mm operator. The input matrix A is
* @brief Implementation of Gather_mm operator. The input matrix A is
* expected to be sorted according to relation type.
* \param A The input dense matrix of dimension m x k
* \param B The input dense matrix of dimension k x n
* \param C The output dense matrix of dimension m x n
* \param seglen_A The input vector of size R. Each element
* @param A The input dense matrix of dimension m x k
* @param B The input dense matrix of dimension k x n
* @param C The output dense matrix of dimension m x n
* @param seglen_A The input vector of size R. Each element
* is the length of segments of input ``A``
* \param a_trans Matrix A to be transposed
* \param b_trans Matrix B to be transposed
* @param a_trans Matrix A to be transposed
* @param b_trans Matrix B to be transposed
*/
template <int XPU, typename IdType, typename DType>
void SegmentMM(const NDArray A,
......@@ -308,13 +310,13 @@ void SegmentMMBackwardB(const NDArray A,
}
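
To make the documented SegmentMM semantics concrete, here is a hedged host-side reference in plain C++; it assumes B is a stack of R k-by-n matrices, one per segment of A, which the surrounding comments suggest but the excerpt does not spell out:

```cuda
#include <vector>

// Reference semantics sketch (not the CUDA implementation):
// every row in segment r of A is multiplied by the r-th k x n matrix of B.
std::vector<std::vector<double>> SegmentMMReference(
    const std::vector<std::vector<double>>& A,               // m x k
    const std::vector<std::vector<std::vector<double>>>& B,  // R x k x n
    const std::vector<int>& seglen_A) {                      // R segment lengths
  const size_t n = B.empty() ? 0 : B[0][0].size();
  std::vector<std::vector<double>> C(A.size(), std::vector<double>(n, 0.0));
  size_t row = 0;
  for (size_t r = 0; r < seglen_A.size(); ++r) {
    for (int s = 0; s < seglen_A[r]; ++s, ++row) {
      for (size_t j = 0; j < n; ++j) {
        double acc = 0.0;
        for (size_t t = 0; t < A[row].size(); ++t)
          acc += A[row][t] * B[r][t][j];
        C[row][j] = acc;
      }
    }
  }
  return C;
}
```
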
/*!
* \brief Implementation of Gather_mm operator. The input matrix A is
* @brief Implementation of Gather_mm operator. The input matrix A is
* expected to be sorted according to relation type.
* \param A The input dense matrix of dimension m x k
* \param B The input dense matrix of dimension k x n
* \param C The output dense matrix of dimension m x n
* \param idx_a The input vector to gather left hand operand on
* \param idx_b The input vector to gather right hand operand on
* @param A The input dense matrix of dimension m x k
* @param B The input dense matrix of dimension k x n
* @param C The output dense matrix of dimension m x n
* @param idx_a The input vector to gather left hand operand on
* @param idx_b The input vector to gather right hand operand on
*/
template <int XPU, typename IdType, typename DType>
......@@ -345,17 +347,17 @@ void GatherMM(const NDArray A,
}
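
Likewise, a hedged reference for GatherMM as the parameters above describe it, assuming idx_a gathers rows of A and idx_b selects one k-by-n weight matrix from a stacked B (illustrative names, not the CUDA path):

```cuda
#include <vector>

// C[i, :] = A[idx_a[i], :] * B[idx_b[i]]  -- reference semantics sketch only.
std::vector<std::vector<double>> GatherMMReference(
    const std::vector<std::vector<double>>& A,               // rows of length k
    const std::vector<std::vector<std::vector<double>>>& B,  // num_rel x k x n
    const std::vector<int>& idx_a,
    const std::vector<int>& idx_b) {
  const size_t m = idx_a.size();
  const size_t n = B.empty() ? 0 : B[0][0].size();
  std::vector<std::vector<double>> C(m, std::vector<double>(n, 0.0));
  for (size_t i = 0; i < m; ++i) {
    const auto& a_row = A[idx_a[i]];   // gathered left-hand operand
    const auto& w     = B[idx_b[i]];   // gathered right-hand operand (one weight matrix)
    for (size_t j = 0; j < n; ++j)
      for (size_t t = 0; t < a_row.size(); ++t)
        C[i][j] += a_row[t] * w[t][j];
  }
  return C;
}
```
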
/*!
* \brief Implementation of Gather_mm operator. The input matrix A is
* @brief Implementation of Gather_mm operator. The input matrix A is
* expected to be sorted according to relation type.
* \param A The input dense matrix of dimension m x k
* \param B The input dense matrix of dimension k x n
* \param C The output dense matrix of dimension m x n
* \param idx_a The input vector to gather left hand operand on
* \param idx_b The input vector to gather right hand operand on
* \param idx_c The input vector to gather output operand on
* \param num_rel The number of idx types in idx_b
* \param a_trans Matrix A to be transposed
* \param b_trans Matrix B to be transposed
* @param A The input dense matrix of dimension m x k
* @param B The input dense matrix of dimension k x n
* @param C The output dense matrix of dimension m x n
* @param idx_a The input vector to gather left hand operand on
* @param idx_b The input vector to gather right hand operand on
* @param idx_c The input vector to gather output operand on
* @param num_rel The number of idx types in idx_b
* @param a_trans Matrix A to be transposed
* @param b_trans Matrix B to be transposed
*/
template <int XPU, typename IdType, typename DType>
void GatherMMScatter(const NDArray A,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/ge_spmm.cuh
* \brief GE-SpMM CUDA kernel function header.
* @file array/cuda/ge_spmm.cuh
* @brief GE-SpMM CUDA kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
......@@ -19,8 +19,8 @@ namespace aten {
namespace cuda {
/*!
* \brief CUDA kernel of GE-SpMM on Csr.
* \note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
* @brief CUDA kernel of GE-SpMM on Csr.
* @note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
* The grid dimension x and y are reordered for better performance.
*/
template <typename Idx, typename DType, typename BinaryOp>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/macro.cuh
* \brief Macro to call SPMM/SDDMM cuda kernels.
* @file array/cuda/macro.cuh
* @brief Macro to call SPMM/SDDMM cuda kernels.
*/
#ifndef DGL_ARRAY_CUDA_MACRO_CUH_
#define DGL_ARRAY_CUDA_MACRO_CUH_
......
/*!
* Copyright (c) 2021 by Contributors
* \file array/cuda/negative_sampling.cu
* \brief rowwise sampling
* @file array/cuda/negative_sampling.cu
* @brief rowwise sampling
*/
#include <curand_kernel.h>
......@@ -81,7 +81,7 @@ struct IsNotMinusOne {
};
/*!
* \brief Sort ordered pairs in ascending order, using \a tmp_major and \a
* @brief Sort ordered pairs in ascending order, using \a tmp_major and \a
* tmp_minor as temporary buffers, each with \a n elements.
*/
template <typename IdType>
......
/*!
* Copyright (c) 2021 by Contributors
* \file array/cuda/rowwise_sampling.cu
* \brief uniform rowwise sampling
* @file array/cuda/rowwise_sampling.cu
* @brief uniform rowwise sampling
*/
#include <curand_kernel.h>
......
/*!
* Copyright (c) 2022 by Contributors
* \file array/cuda/rowwise_sampling_prob.cu
* \brief weighted rowwise sampling. The degree computing kernels and
* @file array/cuda/rowwise_sampling_prob.cu
* @brief weighted rowwise sampling. The degree computing kernels and
* host-side functions are partially borrowed from the uniform rowwise
* sampling code rowwise_sampling.cu.
* \author pengqirong (OPPO), dlasalle and Xin from Nvidia.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
* @file array/cuda/sddmm.cu
* @brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
......@@ -11,7 +11,7 @@ namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on Csr format.
* @brief CUDA implementation of g-SDDMM on Csr format.
*/
template <int XPU, typename IdType, typename DType>
void SDDMMCsr(const std::string& op,
......@@ -31,7 +31,7 @@ void SDDMMCsr(const std::string& op,
/*!
* \brief CUDA implementation of g-SDDMM on Coo format.
* @brief CUDA implementation of g-SDDMM on Coo format.
*/
template <int XPU, typename IdType, typename DType>
void SDDMMCoo(const std::string& op,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cuh
* \brief SDDMM CUDA kernel function header.
* @file array/cuda/sddmm.cuh
* @brief SDDMM CUDA kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_SDDMM_CUH_
#define DGL_ARRAY_CUDA_SDDMM_CUH_
......@@ -87,8 +87,8 @@ namespace cuda {
constexpr unsigned int full_mask = 0xffffffff;
/*!
* \brief CUDA kernel of g-SDDMM on Coo format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SDDMM on Coo format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -136,8 +136,8 @@ __global__ void SDDMMCooKernel(
}
/*!
* \brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree reduction.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree reduction.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -204,8 +204,8 @@ __device__ __forceinline__ Idx BinarySearchSrc(const Idx *array, Idx length, Idx
}
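
BinarySearchSrc is only shown by its signature here; the usual pattern for an edge-parallel CSR kernel is to binary-search indptr for the row that owns a given edge id. A hedged sketch of that pattern, with an illustrative name (the real helper may differ in signature and edge cases):

```cuda
// Return the row r with indptr[r] <= eid < indptr[r + 1].
template <typename Idx>
__device__ __forceinline__ Idx BinarySearchRowOfEdge(
    const Idx* indptr, Idx num_rows, Idx eid) {
  Idx lo = 0, hi = num_rows - 1;
  while (lo < hi) {
    Idx mid = lo + (hi - lo) / 2;
    if (indptr[mid + 1] <= eid)
      lo = mid + 1;   // eid belongs to a later row
    else
      hi = mid;
  }
  return lo;
}
```
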
/*!
* \brief CUDA kernel of g-SDDMM on Csr format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SDDMM on Csr format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -255,12 +255,12 @@ __global__ void SDDMMCsrKernel(
}
/*!
* \brief CUDA implementation of g-SDDMM on Coo format.
* \param bcast Broadcast information.
* \param coo The Coo matrix.
* \param lhs The left hand side operand feature.
* \param rhs The right hand size operand feature.
* \param out The result feature on edges.
* @brief CUDA implementation of g-SDDMM on Coo format.
* @param bcast Broadcast information.
* @param coo The Coo matrix.
* @param lhs The left hand side operand feature.
* @param rhs The right hand side operand feature.
* @param out The result feature on edges.
*/
template <typename Idx, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
......@@ -324,12 +324,12 @@ void SDDMMCoo(
}
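
For the dot variant of SDDMM on COO, the documented lhs/rhs/out parameters amount to the following hedged reference loop (plain C++; broadcasting and the configurable lhs/rhs targets are omitted):

```cuda
#include <cstdint>
#include <vector>

// For edge e = (row[e], col[e]):  out[e] = <lhs[row[e]], rhs[col[e]]>.
void SDDMMDotReference(const std::vector<int64_t>& row,
                       const std::vector<int64_t>& col,
                       const std::vector<std::vector<float>>& lhs,
                       const std::vector<std::vector<float>>& rhs,
                       std::vector<float>& out) {
  out.assign(row.size(), 0.f);
  for (size_t e = 0; e < row.size(); ++e) {
    float acc = 0.f;
    for (size_t k = 0; k < lhs[row[e]].size(); ++k)
      acc += lhs[row[e]][k] * rhs[col[e]][k];
    out[e] = acc;
  }
}
```
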
/*!
* \brief CUDA implementation of g-SDDMM on Csr format.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param lhs The left hand side operand feature.
* \param rhs The right hand size operand feature.
* \param out The result feature on edges.
* @brief CUDA implementation of g-SDDMM on Csr format.
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param lhs The left hand side operand feature.
* @param rhs The right hand side operand feature.
* @param out The result feature on edges.
*/
template <typename Idx, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
* @file array/cuda/sddmm.cu
* @brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
......@@ -10,7 +10,7 @@ namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
* @brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, typename DType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
* @file array/cuda/sddmm.cu
* @brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
......@@ -10,7 +10,7 @@ namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
* @brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, typename DType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/segment_reduce.cu
* \brief Segment reduce C APIs and definitions.
* @file array/cuda/segment_reduce.cu
* @brief Segment reduce C APIs and definitions.
*/
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/segment_reduce.cuh
* \brief Segment reduce kernel function header.
* @file array/cuda/segment_reduce.cuh
* @brief Segment reduce kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
#define DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
......@@ -21,8 +21,8 @@ namespace aten {
namespace cuda {
/*!
* \brief CUDA kernel of segment reduce.
* \note each blockthread is responsible for aggregation on a row
* @brief CUDA kernel of segment reduce.
* @note each blockthread is responsible for aggregation on a row
* in the result tensor.
*/
template <typename IdType, typename DType, typename ReduceOp>
......@@ -45,8 +45,8 @@ __global__ void SegmentReduceKernel(
}
/*!
* \brief CUDA kernel of scatter add.
* \note each blockthread is responsible for adding a row in feature tensor
* @brief CUDA kernel of scatter add.
* @note each blockthread is responsible for adding a row in feature tensor
* to a target row in output tensor.
*/
template <typename IdType, typename DType>
......@@ -63,8 +63,8 @@ __global__ void ScatterAddKernel(
}
/*!
* \brief CUDA kernel to update gradients for reduce op max/min
* \note each WARP (group of 32 threads) is responsible for adding a row in
* @brief CUDA kernel to update gradients for reduce op max/min
* @note each WARP (group of 32 threads) is responsible for adding a row in
* feature tensor to a target row in output tensor.
*/
......@@ -91,8 +91,8 @@ __global__ void UpdateGradMinMaxHeteroKernel(
}
/*!
* \brief CUDA kernel of backward phase in segment min/max.
* \note each blockthread is responsible for writing a row in the
* @brief CUDA kernel of backward phase in segment min/max.
* @note each blockthread is responsible for writing a row in the
* result gradient tensor by lookup the ArgMin/Max for index information.
*/
template <typename IdType, typename DType>
......@@ -111,11 +111,11 @@ __global__ void BackwardSegmentCmpKernel(
}
/*!
* \brief CUDA implementation of forward phase of Segment Reduce.
* \param feat The input tensor.
* \param offsets The offsets tensor.
* \param out The output tensor.
* \param arg An auxiliary tensor storing ArgMax/Min information,
* @brief CUDA implementation of forward phase of Segment Reduce.
* @param feat The input tensor.
* @param offsets The offsets tensor.
* @param out The output tensor.
* @param arg An auxiliary tensor storing ArgMax/Min information,
*/
template <typename IdType, typename DType, typename ReduceOp>
void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
......@@ -142,11 +142,11 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
}
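
A hedged plain-C++ reference for the max-reduce case of SegmentReduce above, with segment i covering feat rows [offsets[i], offsets[i+1]) and arg recording the ArgMax rows (illustrative types, not the CUDA implementation):

```cuda
#include <cstdint>
#include <limits>
#include <vector>

void SegmentMaxReference(const std::vector<std::vector<float>>& feat,
                         const std::vector<int64_t>& offsets,
                         std::vector<std::vector<float>>& out,
                         std::vector<std::vector<int64_t>>& arg) {
  const size_t dim = feat.empty() ? 0 : feat[0].size();
  const size_t num_seg = offsets.empty() ? 0 : offsets.size() - 1;
  out.assign(num_seg, std::vector<float>(dim, -std::numeric_limits<float>::infinity()));
  arg.assign(num_seg, std::vector<int64_t>(dim, -1));
  for (size_t i = 0; i < num_seg; ++i)
    for (int64_t r = offsets[i]; r < offsets[i + 1]; ++r)
      for (size_t k = 0; k < dim; ++k)
        if (feat[r][k] > out[i][k]) {
          out[i][k] = feat[r][k];
          arg[i][k] = r;   // ArgMax bookkeeping, mirroring the `arg` tensor above
        }
}
```
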
/*!
* \brief CUDA implementation of Scatter Add (on first dimension).
* \note math equation: out[idx[i], *] += feat[i, *]
* \param feat The input tensor.
* \param idx The indices tensor.
* \param out The output tensor.
* @brief CUDA implementation of Scatter Add (on first dimension).
* @note math equation: out[idx[i], *] += feat[i, *]
* @param feat The input tensor.
* @param idx The indices tensor.
* @param out The output tensor.
*/
template <typename IdType, typename DType>
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
......@@ -171,13 +171,13 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
}
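
The equation in the ScatterAdd comment above, out[idx[i], *] += feat[i, *], corresponds to this plain-C++ reference loop (illustrative types only; `out` is assumed pre-sized and zero-initialised):

```cuda
#include <cstdint>
#include <vector>

void ScatterAddReference(const std::vector<std::vector<float>>& feat,
                         const std::vector<int64_t>& idx,
                         std::vector<std::vector<float>>& out) {
  // out[idx[i], *] += feat[i, *]
  for (size_t i = 0; i < feat.size(); ++i)
    for (size_t k = 0; k < feat[i].size(); ++k)
      out[idx[i]][k] += feat[i][k];
}
```
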
/*!
* \brief CUDA implementation to update gradients for reduce op max/min
* \param graph The input heterogeneous graph.
* \param op The binary operator, could be `copy_u`, `copy_e'.
* \param list_feat List of the input tensors.
* \param list_idx List of the indices tensors.
* \param list_idx_etype List of the node- or edge-type tensors.
* \param list_out List of the output tensors.
* @brief CUDA implementation to update gradients for reduce op max/min
* @param graph The input heterogeneous graph.
* @param op The binary operator, could be `copy_u`, `copy_e`.
* @param list_feat List of the input tensors.
* @param list_idx List of the indices tensors.
* @param list_idx_etype List of the node- or edge-type tensors.
* @param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(
......@@ -224,12 +224,12 @@ void UpdateGradMinMax_hetero(
}
/*!
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max
* @brief CUDA implementation of backward phase of Segment Reduce with Min/Max
* reducer.
* \note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
* @note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
* tensor.
* \param arg The ArgMin/Max information, used for indexing.
* \param out The output tensor.
* @param arg The ArgMin/Max information, used for indexing.
* @param out The output tensor.
*/
template <typename IdType, typename DType>
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
......
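
The backward min/max scatter stated above, out[arg[i, k], k] = feat[i, k], as a plain-C++ reference loop; skipping negative arg entries (e.g. for empty segments) is an assumption, and `out` is assumed pre-sized:

```cuda
#include <cstdint>
#include <vector>

void BackwardSegmentCmpReference(const std::vector<std::vector<float>>& feat,
                                 const std::vector<std::vector<int64_t>>& arg,
                                 std::vector<std::vector<float>>& out) {
  // out[arg[i, k], k] = feat[i, k]
  for (size_t i = 0; i < feat.size(); ++i)
    for (size_t k = 0; k < feat[i].size(); ++k)
      if (arg[i][k] >= 0)
        out[arg[i][k]][k] = feat[i][k];
}
```
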
/*!
* Copyright (c) 2021 by contributors.
* \file array/cuda/spmat_op_impl_coo.cu
* \brief COO operator GPU implementation
* @file array/cuda/spmat_op_impl_coo.cu
* @brief COO operator GPU implementation
*/
#include <dgl/array.h>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmat_op_impl_csr.cu
* \brief CSR operator CPU implementation
* @file array/cuda/spmat_op_impl_csr.cu
* @brief CSR operator CPU implementation
*/
#include <dgl/array.h>
......@@ -82,7 +82,7 @@ template NDArray CSRIsNonZero<kDGLCUDA, int64_t>(CSRMatrix, NDArray, NDArray);
///////////////////////////// CSRHasDuplicate /////////////////////////////
/*!
* \brief Check whether each row does not have any duplicate entries.
* @brief Check whether each row does not have any duplicate entries.
* Assume the CSR is sorted.
*/
template <typename IdType>
......@@ -228,7 +228,7 @@ template CSRMatrix CSRSliceRows<kDGLCUDA, int32_t>(CSRMatrix, int64_t, int64_t);
template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, int64_t, int64_t);
/*!
* \brief Copy data segment to output buffers
* @brief Copy data segment to output buffers
*
* For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
* to the out_data from out_indptr[i] ~ out_indptr[i+1]
......@@ -299,7 +299,7 @@ template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetDataAndIndices /////////////////////////////
/*!
* \brief Generate a 0-1 mask for each index that hits the provided (row, col)
* @brief Generate a 0-1 mask for each index that hits the provided (row, col)
* index.
*
* Examples:
......@@ -331,7 +331,7 @@ __global__ void _SegmentMaskKernel(
}
/*!
* \brief Search for the insertion positions for needle in the hay.
* @brief Search for the insertion positions for needle in the hay.
*
* The hay is a list of sorted elements and the result is the insertion position
* of each needle so that the insertion still gives sorted order.
......@@ -424,7 +424,7 @@ template std::vector<NDArray> CSRGetDataAndIndices<kDGLCUDA, int64_t>(
///////////////////////////// CSRSliceMatrix /////////////////////////////
/*!
* \brief Generate a 0-1 mask for each index whose column is in the provided
* @brief Generate a 0-1 mask for each index whose column is in the provided
* set. It also counts the number of masked values per row.
*/
template <typename IdType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmm.cu
* \brief SPMM C APIs and definitions.
* @file array/cuda/spmm.cu
* @brief SPMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./spmm.cuh"
......@@ -16,8 +16,8 @@ using namespace cuda;
namespace aten {
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \note use cusparse if the reduce operator is `sum` and there is
* @brief CUDA implementation of g-SpMM on Csr format.
* @note use cusparse if the reduce operator is `sum` and there is
* no broadcast, use dgl's kernel in other cases.
*/
template <int XPU, typename IdType, typename DType>
......@@ -81,7 +81,7 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
/*!
* \brief CUDA implementation of g-SpMM on Coo format.
* @brief CUDA implementation of g-SpMM on Coo format.
*/
template <int XPU, typename IdType, typename DType>
void SpMMCoo(const std::string& op, const std::string& reduce,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmm.cuh
* \brief SPMM CUDA kernel function header.
* @file array/cuda/spmm.cuh
* @brief SPMM CUDA kernel function header.
*/
#ifndef DGL_ARRAY_CUDA_SPMM_CUH_
#define DGL_ARRAY_CUDA_SPMM_CUH_
......@@ -22,7 +22,7 @@ using namespace cuda;
namespace aten {
/*!
* \brief Determine whether cusparse SpMM function is applicable.
* @brief Determine whether cusparse SpMM function is applicable.
*/
template <typename DType, typename IdType>
inline bool cusparse_available(bool more_nnz_than_matrix_size) {
......@@ -41,7 +41,7 @@ inline bool cusparse_available(bool more_nnz_than_matrix_size) {
namespace {
/*! \brief Call cuBLAS geam API for transpose operation for float and double. */
/*! @brief Call cuBLAS geam API for transpose operation for float and double. */
template <typename DType>
cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
......@@ -98,8 +98,8 @@ cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
beta, B, ldb, C, ldc);
}
/* \brief IndexSelect operator kernel implementation.
* \note duplicate of IndexSelectKernel defined in array_index_select.cu
/* @brief IndexSelect operator kernel implementation.
* @note duplicate of IndexSelectKernel defined in array_index_select.cu
*/
template <typename DType, typename IdType>
__global__ void _IndexSelectKernel(
......@@ -112,8 +112,8 @@ __global__ void _IndexSelectKernel(
out[i * m + j] = in[idx[i] * m + j];
}
/* \brief Transpose operator kernel implementation.
* \note not efficient but it's not a bottleneck, used for float16 dtype.
/* @brief Transpose operator kernel implementation.
* @note not efficient but it's not a bottleneck, used for float16 dtype.
*/
template <typename DType>
__global__ void _TransposeKernel(
......@@ -126,9 +126,9 @@ __global__ void _TransposeKernel(
}
/*
* \brief Tranpose the input matrix.
* \param row number of rows of input matrix.
* \param col number of columns of input matrix.
* @brief Transpose the input matrix.
* @param row number of rows of input matrix.
* @param col number of columns of input matrix.
*/
template <typename DType>
void _Transpose(const DType* in, DType* out,
......@@ -150,8 +150,8 @@ void _Transpose(const DType* in, DType* out,
}
/*
* \brief Tranpose the input matrix for data type half.
* \note cuBLAS has no geam API for half data type, fallback to our kernel.
* @brief Transpose the input matrix for data type half.
* @note cuBLAS has no geam API for half data type, fallback to our kernel.
*/
template <>
void _Transpose<half>(const half* in, half* out,
......@@ -164,8 +164,8 @@ void _Transpose<half>(const half* in, half* out,
#if BF16_ENABLED
/*
* \brief Tranpose the input matrix for data type half.
* \note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
* @brief Transpose the input matrix for data type bf16.
* @note cuBLAS has no geam API for bf16 data type, fallback to our kernel.
*/
template <>
void _Transpose<__nv_bfloat16>(const __nv_bfloat16* in, __nv_bfloat16* out,
......@@ -178,7 +178,7 @@ void _Transpose<__nv_bfloat16>(const __nv_bfloat16* in, __nv_bfloat16* out,
#endif // BF16_ENABLED
/*
* \brief
* @brief
*/
template <typename DType, typename IdType>
__global__ void _IndexSelectKernel(const DType* array, const IdType* index,
......@@ -191,8 +191,8 @@ __global__ void _IndexSelectKernel(const DType* array, const IdType* index,
}
}
/* \brief IndexSelect operator.
* \note duplicate of IndexSelect defined in array_op.h but it can
/* @brief IndexSelect operator.
* @note duplicate of IndexSelect defined in array_op.h but it can
* not be applied to float16 dtype.
*/
template<typename DType, typename IdType>
......@@ -477,8 +477,8 @@ namespace cuda {
/*!
* \brief CUDA kernel of g-SpMM on Coo format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SpMM on Coo format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -526,8 +526,8 @@ __global__ void SpMMCooKernel(
}
/*!
* \brief CUDA kernel to compute argu and arge in g-SpMM on Coo format.
* \note it uses edge parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel to compute argu and arge in g-SpMM on Coo format.
* @note it uses edge parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
......@@ -574,8 +574,8 @@ __global__ void ArgSpMMCooKernel(
}
/*!
* \brief CUDA kernel of g-SpMM on Csr format.
* \note it uses node parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of g-SpMM on Csr format.
* @note it uses node parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different destination nodes.
* Threadblocks on the x-axis are responsible for the computation on
* different positions in feature dimension.
......@@ -632,8 +632,8 @@ __global__ void SpMMCsrKernel(
}
/*!
* \brief CUDA kernel of SpMM-Min/Max on Csr format.
* \note it uses node parallel strategy, different threadblocks (on y-axis)
* @brief CUDA kernel of SpMM-Min/Max on Csr format.
* @note it uses node parallel strategy, different threadblocks (on y-axis)
* is responsible for the computation on different destination nodes.
* Threadblocks on the x-axis are responsible for the computation on
* different positions in feature dimension.
......@@ -693,16 +693,16 @@ __global__ void SpMMCmpCsrHeteroKernel(
}
/*!
* \brief CUDA implementation of g-SpMM on Coo format.
* \param bcast Broadcast information.
* \param coo The Coo matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* @brief CUDA implementation of g-SpMM on Coo format.
* @param bcast Broadcast information.
* @param coo The Coo matrix.
* @param ufeat The feature on source nodes.
* @param efeat The feature on edges.
* @param out The result feature on destination nodes.
* @param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* @param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
*/
......@@ -770,16 +770,16 @@ void SpMMCoo(
}
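
A hedged reference for g-SpMM on COO with op = mul and reduce = sum, matching the ufeat/efeat/out parameters documented above (plain C++; the argu/arge bookkeeping for min/max reducers is omitted, and `out` is assumed pre-sized to num_dst_nodes x feature_dim and zero-initialised):

```cuda
#include <cstdint>
#include <vector>

// For every edge e = (src[e], dst[e]):
//   out[dst[e], :] += ufeat[src[e], :] * efeat[e, :]
void SpMMSumMulReference(const std::vector<int64_t>& src,
                         const std::vector<int64_t>& dst,
                         const std::vector<std::vector<float>>& ufeat,
                         const std::vector<std::vector<float>>& efeat,
                         std::vector<std::vector<float>>& out) {
  for (size_t e = 0; e < src.size(); ++e)
    for (size_t k = 0; k < out[dst[e]].size(); ++k)
      out[dst[e]][k] += ufeat[src[e]][k] * efeat[e][k];
}
```
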
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* @brief CUDA implementation of g-SpMM on Csr format.
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param ufeat The feature on source nodes.
* @param efeat The feature on edges.
* @param out The result feature on destination nodes.
* @param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* @param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
*/
......@@ -825,26 +825,26 @@ void SpMMCsr(
}
/*!
* \brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* @brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph.
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param ufeat The feature on source nodes.
* @param efeat The feature on edges.
* @param out The result feature on destination nodes.
* @param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* @param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param argu_ntype Node type of the arg-Min/Max on source nodes, which refers the
* @param argu_ntype Node type of the arg-Min/Max on source nodes, which refers the
* source node types correspond to the minimum/maximum values of reduction result
* on destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge_etype Edge-type of the arg-Min/Max on edges. which refers the source
* @param arge_etype Edge-type of the arg-Min/Max on edges. which refers the source
* node indices correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param src_type Node type of the source nodes of an etype
* \param etype Edge type
* @param src_type Node type of the source nodes of an etype
* @param etype Edge type
*/
template <typename Idx, typename DType,
typename BinaryOp, typename ReduceOp>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmm.cu
* \brief SPMM C APIs and definitions.
* @file array/cuda/spmm.cu
* @brief SPMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./spmm.cuh"
......@@ -16,8 +16,8 @@ using namespace cuda;
namespace aten {
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \note use cusparse if the reduce operator is `sum` and there is
* @brief CUDA implementation of g-SpMM on Csr format.
* @note use cusparse if the reduce operator is `sum` and there is
* no broadcast, use dgl's kernel in other cases.
*/
template <int XPU, typename IdType, typename DType>
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.cu
* \brief Utilities for CUDA kernels.
* @file array/cuda/utils.cu
* @brief Utilities for CUDA kernels.
*/
#include "./utils.h"
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.h
* \brief Utilities for CUDA kernels.
* @file array/cuda/utils.h
* @brief Utilities for CUDA kernels.
*/
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_
......@@ -23,7 +23,7 @@ namespace cuda {
#define CUDA_MAX_NUM_THREADS 256
/*! \brief Calculate the number of threads needed given the dimension length.
/*! @brief Calculate the number of threads needed given the dimension length.
*
* It finds the biggest number that is smaller than min(dim, max_nthrs)
* and is also power of two.
......@@ -78,20 +78,20 @@ __device__ __forceinline__ T _ldg(T* addr) {
}
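
The thread-count helper described above (largest power of two not exceeding min(dim, CUDA_MAX_NUM_THREADS)) can be sketched as follows; the name and the exact boundary behaviour are assumptions, not the real utils.h code:

```cuda
inline int LargestPow2Threads(int dim, int max_nthrs = 256 /* CUDA_MAX_NUM_THREADS */) {
  int ret = max_nthrs;
  while (ret > dim && ret > 1)
    ret >>= 1;   // halve until it fits under dim (never below 1)
  return ret;
}
```
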
/*!
* \brief Return true if the given bool flag array is all true.
* @brief Return true if the given bool flag array is all true.
* The input bool array is in int8_t type so it is aligned with byte address.
*
* \param flags The bool array.
* \param length The length.
* \param ctx Device context.
* \return True if all the flags are true.
* @param flags The bool array.
* @param length The length.
* @param ctx Device context.
* @return True if all the flags are true.
*/
bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx);
/*!
* \brief CUDA Kernel of filling the vector started from ptr of size length
* @brief CUDA Kernel of filling the vector started from ptr of size length
* with val.
* \note internal use only.
* @note internal use only.
*/
template <typename DType>
__global__ void _FillKernel(DType* ptr, size_t length, DType val) {
......@@ -103,7 +103,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) {
}
}
/*! \brief Fill the vector started from ptr of size length with val */
/*! @brief Fill the vector started from ptr of size length with val */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
......@@ -113,7 +113,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
}
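
A hedged sketch of the fill pattern above: a grid-stride kernel plus a plain launch on the current stream. Names and the block-sizing choice are illustrative; the real _Fill may differ:

```cuda
#include <cuda_runtime.h>

template <typename DType>
__global__ void FillKernelSketch(DType* ptr, size_t length, DType val) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = gridDim.x * blockDim.x;
  for (; i < length; i += stride)
    ptr[i] = val;   // each thread fills a strided subset of the vector
}

template <typename DType>
void FillSketch(DType* ptr, size_t length, DType val, cudaStream_t stream) {
  if (length == 0) return;
  const int nt = 256;   // e.g. CUDA_MAX_NUM_THREADS
  const int nb = static_cast<int>((length + nt - 1) / nt);
  FillKernelSketch<<<nb, nt, 0, stream>>>(ptr, length, val);
}
```
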
/*!
* \brief Search adjacency list linearly for each (row, col) pair and
* @brief Search adjacency list linearly for each (row, col) pair and
* write the data under the matched position in the indices array to the output.
*
* If there is no match, the value in \c filler is written.
......@@ -157,7 +157,7 @@ __global__ void _LinearSearchKernel(
#if BF16_ENABLED
/*!
* \brief Specialization for bf16 because conversion from long long to bfloat16
* @brief Specialization for bf16 because conversion from long long to bfloat16
* doesn't exist before SM80.
*/
template <typename IdType>
......@@ -206,7 +206,7 @@ inline DType GetCUDAScalar(
}
/*!
* \brief Given a sorted array and a value this function returns the index
* @brief Given a sorted array and a value this function returns the index
* of the first element which compares greater than value.
*
* This function assumes 0-based index
......@@ -231,7 +231,7 @@ __device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) {
}
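
Given the description and the signature shown for _UpperBound, a standard upper-bound binary search looks like this (sketch with an illustrative name):

```cuda
#include <cstdint>

// First index i in [0, n] with A[i] > x; returns n if no element is greater.
template <typename IdType>
__device__ IdType UpperBoundSketch(const IdType* A, int64_t n, IdType x) {
  int64_t lo = 0, hi = n;
  while (lo < hi) {
    const int64_t mid = lo + (hi - lo) / 2;
    if (A[mid] > x)
      hi = mid;
    else
      lo = mid + 1;
  }
  return static_cast<IdType>(lo);
}
```
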
/*!
* \brief Given a sorted array and a value this function returns the index
* @brief Given a sorted array and a value this function returns the index
* of the element who is equal to val. If not exist returns n+1
*
* This function assumes 0-based index
......
/*!
* Copyright (c) 2019-2022 by Contributors
* \file array/cuda/uvm/array_index_select_uvm.cu
* \brief Array index select GPU implementation
* @file array/cuda/uvm/array_index_select_uvm.cu
* @brief Array index select GPU implementation
*/
#include <dgl/array.h>
......