"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "dbb767910fac67d996aaeb4e0fff630f0419edae"
Unverified commit c81efdf2 authored by Jinjing Zhou, committed by GitHub

Remove deprecated kernels (#3316)

* remove

* remove

* fix

* remove

* remove
parent 75d793a1
@@ -145,8 +145,6 @@ file(GLOB DGL_SRC
src/array/cpu/*.cc
src/random/*.cc
src/random/cpu/*.cc
src/kernel/*.cc
src/kernel/cpu/*.cc
src/runtime/*.cc
src/geometry/*.cc
src/geometry/cpu/*.cc
......
@@ -10,6 +10,9 @@
#include <cassert>
#include "fp16.cuh"
#if __CUDA_ARCH__ >= 600
#include <cuda_fp16.h>
#endif
namespace dgl {
namespace aten {
@@ -133,6 +136,84 @@ DEFINE_ATOMIC_HALF(Min)
DEFINE_ATOMIC(Add)
#undef OP
/**
 * \brief Performs an atomic compare-and-swap on 64-bit integers. That is,
 * it reads the word `old` at the memory location `address`, computes
 * `(old == compare ? val : old)`, and stores the result back to memory at
 * the same address.
*
* \param address The address to perform the atomic operation on.
* \param compare The value to compare to.
* \param val The new value to conditionally store.
*
* \return The old value at the address.
*/
inline __device__ int64_t AtomicCAS(
int64_t * const address,
const int64_t compare,
const int64_t val) {
// match the type of "::atomicCAS", so ignore lint warning
using Type = unsigned long long int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicCAS(reinterpret_cast<Type*>(address),
static_cast<Type>(compare),
static_cast<Type>(val));
}
/**
 * \brief Performs an atomic compare-and-swap on 32-bit integers. That is,
 * it reads the word `old` at the memory location `address`, computes
 * `(old == compare ? val : old)`, and stores the result back to memory at
 * the same address.
*
* \param address The address to perform the atomic operation on.
* \param compare The value to compare to.
* \param val The new value to conditionally store.
*
* \return The old value at the address.
*/
inline __device__ int32_t AtomicCAS(
int32_t * const address,
const int32_t compare,
const int32_t val) {
// match the type of "::atomicCAS", so ignore lint warning
using Type = int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicCAS(reinterpret_cast<Type*>(address),
static_cast<Type>(compare),
static_cast<Type>(val));
}
inline __device__ int64_t AtomicMax(
int64_t * const address,
const int64_t val) {
// match the type of "::atomicMax", so ignore lint warning
using Type = unsigned long long int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicMax(reinterpret_cast<Type*>(address),
static_cast<Type>(val));
}
inline __device__ int32_t AtomicMax(
int32_t * const address,
const int32_t val) {
// match the type of "::atomicMax", so ignore lint warning
using Type = int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicMax(reinterpret_cast<Type*>(address),
static_cast<Type>(val));
}
template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
#if __CUDA_ARCH__ >= 200
......
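The AtomicCAS overloads added above can also serve as building blocks for atomics that CUDA does not expose natively for signed 64-bit integers. As a minimal sketch (not part of this commit; AtomicMinInt64 is a hypothetical name), an atomic minimum can be written as the usual compare-and-swap loop:

// Hypothetical CAS-loop atomic minimum built on the AtomicCAS(int64_t*, ...)
// overload introduced above. Returns the value stored at *address before the
// call, mirroring the convention of the other atomics in this header.
inline __device__ int64_t AtomicMinInt64(
    int64_t * const address,
    const int64_t val) {
  int64_t old = *address;
  while (val < old) {
    const int64_t assumed = old;
    old = AtomicCAS(address, assumed, val);
    if (old == assumed) break;  // swap succeeded, minimum updated
  }
  return old;
}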
@@ -10,10 +10,10 @@
#include <numeric>
#include "./dgl_cub.cuh"
-#include "../../kernel/cuda/atomic.cuh"
+#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
-using namespace dgl::kernel::cuda;
+using namespace dgl::aten::cuda;
namespace dgl {
namespace aten {
......
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce.h
* \brief Binary reduce function C++ header.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_H_
#define DGL_KERNEL_BINARY_REDUCE_H_
#include <dgl/runtime/ndarray.h>
#include <vector>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace dgl {
namespace kernel {
// Structure for broadcasting shapes
struct BcastInfo {
// inferred output shape
std::vector<int64_t> real_out_shape;
// Following shapes here have been preprocessed, so that:
// - The first dimension (for graph) is removed. Shapes here are only for features.
// - They have the same number of dimensions.
// e.g. (4,) and (3, 4) become (1, 4) and (3, 4)
// - Continuous non-broadcasting dimensions are flattened.
// e.g. (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9)
std::vector<int64_t> lhs_shape, lhs_stride;
std::vector<int64_t> rhs_shape, rhs_stride;
std::vector<int64_t> out_shape, out_stride;
int64_t data_len;
};
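For illustration only (this helper is hypothetical and is not the removed DGL code), the preprocessing described in the comments above can be sketched as: pad both feature shapes to a common rank with leading 1s, then merge adjacent dimensions on which neither operand broadcasts.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative sketch of the BcastInfo shape preprocessing described above.
void PreprocessShapes(std::vector<int64_t> lhs, std::vector<int64_t> rhs,
                      std::vector<int64_t>* out_lhs, std::vector<int64_t>* out_rhs) {
  const size_t ndim = std::max(lhs.size(), rhs.size());
  lhs.insert(lhs.begin(), ndim - lhs.size(), 1);   // (4,)   -> (1, 4)
  rhs.insert(rhs.begin(), ndim - rhs.size(), 1);   // (3, 4) -> (3, 4)
  out_lhs->clear();
  out_rhs->clear();
  for (size_t i = 0; i < ndim; ++i) {
    const bool bcast = (lhs[i] != rhs[i]);
    const bool prev_bcast = out_lhs->empty() || (out_lhs->back() != out_rhs->back());
    if (!bcast && !prev_bcast) {
      // Merge with the previous non-broadcasting dimension:
      // (4, 1, 3, 3) & (4, 5, 3, 3) -> (4, 1, 9) & (4, 5, 9)
      out_lhs->back() *= lhs[i];
      out_rhs->back() *= rhs[i];
    } else {
      out_lhs->push_back(lhs[i]);
      out_rhs->push_back(rhs[i]);
    }
  }
}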
/*!
* \brief Compute the feature shape after binary reduce computation.
*/
std::vector<int64_t> InferBinaryFeatureShape(
runtime::NDArray lhs,
runtime::NDArray rhs);
/*!
* \brief Perform binary operation between the given data and reduce by the graph.
*
 * If the reducer is one of "sum", "max", "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} ( A[s1(i, j, e)] op B[s2(i, j, e)] )
*
* , where A, B are two input feature tensors, op could be element-wise add/sub/div/mul.
* Depending on the lhs and rhs target, s1 and s2 will select the src/dst/edge
* ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)] op B[s2(i, j, e)]
*
 * Here, the node/edge feature (e.g., A[i], B[e]) could be a dense tensor. In such
 * a case, broadcasting is supported on the feature dimensions.
*
* Examples:
*
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void BinaryOpReduce(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
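As a plain reference for the semantics documented above (a sketch only, not the removed Minigun-based implementation), the "sum" reducer with the "add" operator, lhs = source node and rhs = edge, reduces to a loop over an in-edge CSR:

#include <cstdint>
#include <vector>

// Reference semantics: out[dst] = sum over in-edges e = (src, dst) of
// (src_feat[src] + edge_feat[e]), with feature dimension `dim`.
// indptr/indices/eids form an in-edge CSR: the in-edges of node i are at
// positions indptr[i]..indptr[i+1) with source ids indices[k] and edge ids eids[k].
std::vector<float> SumAddSrcEdge(
    const std::vector<int64_t>& indptr,
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& eids,
    const std::vector<float>& src_feat,
    const std::vector<float>& edge_feat,
    int64_t dim) {
  const int64_t num_nodes = static_cast<int64_t>(indptr.size()) - 1;
  std::vector<float> out(num_nodes * dim, 0.f);
  for (int64_t dst = 0; dst < num_nodes; ++dst) {
    for (int64_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
      const int64_t src = indices[k];
      const int64_t eid = eids[k];
      for (int64_t d = 0; d < dim; ++d) {
        out[dst * dim + d] += src_feat[src * dim + d] + edge_feat[eid * dim + d];
      }
    }
  }
  return out;
}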
/*!
* \brief Compute the lhs gradient of BinaryOpReduce
*
 * Broadcasting along feature dimensions is supported. However, the gradient
 * of the broadcast dimensions will *not* be reduced. Therefore, the
 * gradient tensor has the same shape as the out tensor.
*
* Examples:
* A.shape = (N, D1, 1) # N is the number of nodes
* B.shape = (M, D1, D2) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dA = BackwardLhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dA.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
void BackwardLhsBinaryOpReduce(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping,
runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping,
runtime::NDArray lhs_data,
runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data);
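Since the broadcast dimensions are not reduced here (note the "extra reduction should be handled afterwards" remark above), a caller that fed an (N, D1, 1) lhs must itself sum the returned (N, D1, D2) gradient over the broadcast axis. A minimal sketch of that post-reduction on plain buffers, with a hypothetical helper name:

#include <cstdint>
#include <vector>

// Collapse the gradient of a broadcast trailing axis: (N, D1, D2) -> (N, D1, 1).
std::vector<float> ReduceLastAxis(const std::vector<float>& grad,
                                  int64_t n, int64_t d1, int64_t d2) {
  std::vector<float> out(n * d1, 0.f);
  for (int64_t i = 0; i < n * d1; ++i) {
    for (int64_t k = 0; k < d2; ++k) {
      out[i] += grad[i * d2 + k];
    }
  }
  return out;
}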
/*!
* \brief Compute the rhs gradient of BinaryOpReduce
*
 * Broadcasting along feature dimensions is supported. However, the gradient
 * of the broadcast dimensions will *not* be reduced. Therefore, the
 * gradient tensor has the same shape as the out tensor.
*
* Examples:
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dB = BackwardRhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dB.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_rhs_data The gradient rhs tensor.
*/
void BackwardRhsBinaryOpReduce(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping,
runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping,
runtime::NDArray lhs_data,
runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_rhs_data);
/*!
* \brief Copy the target data and reduce by graph structure.
*
 * If the reducer is one of "sum", "max", "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} A[s1(i, j, e)]
*
 * , where A is the input feature tensor.
 * Depending on the target, s1 will select the src/dst/edge
 * ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)]
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
 * \param target The input target (src, edge)
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void CopyReduce(
const std::string& reducer,
const CSRWrapper& graph,
binary_op::Target target,
runtime::NDArray in_data, runtime::NDArray out_data,
runtime::NDArray in_mapping, runtime::NDArray out_mapping);
/*!
* \brief Compute backward of the CopyReduce
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
 * \param target The input target (src, edge)
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_in_data The gradient input tensor.
*/
void BackwardCopyReduce(
const std::string& reducer,
const CSRWrapper& graph,
binary_op::Target target,
runtime::NDArray in_mapping,
runtime::NDArray out_mapping,
runtime::NDArray in_data,
runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_in_data);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl.h
* \brief Implementations of binary reduce operations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <string>
#ifdef __CUDACC__
#include "../runtime/cuda/cuda_common.h"
#endif
#include "./binary_reduce.h"
#include "./binary_reduce_impl_decl.h"
#include "./csr_interface.h"
#include "./utils.h"
namespace dgl {
namespace kernel {
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType, typename Reducer>
GData<Idx, DType> AllocGData(const std::string& op,
const DLContext& ctx, int64_t x_len,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_mapping, runtime::NDArray out_data) {
// GData
GData<Idx, DType> gdata;
gdata.x_length = x_len;
gdata.lhs_data = static_cast<DType*>(lhs_data->data);
gdata.rhs_data = static_cast<DType*>(rhs_data->data);
gdata.out_data = static_cast<DType*>(out_data->data);
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
// for dot operation: vector [dot] vector
if (op == binary_op::kDot) {
// get size of vector
gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
} else {
gdata.data_len = 1;
}
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data), Zero<Reducer>::value);
return gdata;
}
template <int XPU>
void BinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping) {
using runtime::NDArray;
using minigun::Csr;
// device
#ifdef __CUDACC__
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
const int64_t x_len = utils::ComputeXLength(out_data);
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
const DLDataType& dtype = out_data->dtype;
const auto bits = graph.NumBits();
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
auto gdata = AllocGData<XPU, Idx, DType, Reducer>(op,
rtcfg.ctx, x_len, lhs_mapping, rhs_mapping,
lhs_data, rhs_data, out_mapping, out_data);
OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
CallBinaryReduce<XPU, Idx, DType, LeftTarget,
RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType>
BackwardGData<Idx, DType> AllocBackwardGData(
const std::string& op, const DLContext& ctx, int64_t x_len,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data) {
// GData
BackwardGData<Idx, DType> gdata;
gdata.x_length = x_len;
gdata.lhs_data = static_cast<DType*>(lhs_data->data);
gdata.rhs_data = static_cast<DType*>(rhs_data->data);
gdata.out_data = static_cast<DType*>(out_data->data);
gdata.grad_out_data = static_cast<DType*>(grad_out_data->data);
if (!aten::IsNullArray(grad_lhs_data)) {
gdata.grad_lhs_data = static_cast<DType*>(grad_lhs_data->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs_data),
static_cast<DType>(0));
}
if (!aten::IsNullArray(grad_rhs_data)) {
gdata.grad_rhs_data = static_cast<DType*>(grad_rhs_data->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs_data),
static_cast<DType>(0));
}
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
// for dot operation: vector [dot] vector
if (op == binary_op::kDot) {
// get size of vector
gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
} else {
gdata.data_len = 1;
}
return gdata;
}
template <int XPU>
void BackwardBinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data) {
using runtime::NDArray;
using minigun::Csr;
#ifdef __CUDACC__
// device
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
// Graph
const int64_t x_len = utils::ComputeXLength(out_data);
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
const DLDataType& dtype = out_data->dtype;
const bool req_lhs = !aten::IsNullArray(grad_lhs_data);
const bool req_rhs = !aten::IsNullArray(grad_rhs_data);
const auto bits = graph.NumBits();
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
auto gdata = AllocBackwardGData<XPU, Idx, DType>(op,
rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, out_mapping,
lhs_data, rhs_data, out_data, grad_out_data,
grad_lhs_data, grad_rhs_data);
BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
CallBackwardBinaryReduce<XPU, Mode, Idx, DType, LeftTarget,
RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
});
}
///////////////////////////////////////////////////////////////////////////////
// BinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, int NDim, typename Idx, typename DType, typename Reducer>
BcastGData<NDim, Idx, DType> AllocBcastGData(
const DLContext& ctx, const BcastInfo& info,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_mapping, runtime::NDArray out_data) {
// GData
BcastGData<NDim, Idx, DType> gdata;
// dim, shape and stride
gdata.ndim = info.lhs_shape.size();
std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
gdata.lhs_len = utils::Prod(info.lhs_shape);
gdata.rhs_len = utils::Prod(info.rhs_shape);
gdata.out_len = utils::Prod(info.out_shape);
// data
gdata.lhs_data = static_cast<DType*>(lhs_data->data);
gdata.rhs_data = static_cast<DType*>(rhs_data->data);
gdata.out_data = static_cast<DType*>(out_data->data);
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
gdata.data_len = info.data_len;
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data), Zero<Reducer>::value);
return gdata;
}
template <int XPU>
void BinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs,
binary_op::Target rhs,
runtime::NDArray lhs_data,
runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping,
runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping) {
using runtime::NDArray;
using minigun::Csr;
#ifdef __CUDACC__
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int64_t x_len = utils::ComputeXLength(out_data);
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
const DLDataType& dtype = out_data->dtype;
const int bcast_ndim = info.out_shape.size();
const auto bits = graph.NumBits();
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
auto gdata = AllocBcastGData<XPU, NDim, Idx, DType, Reducer>(
rtcfg.ctx, info, lhs_mapping, rhs_mapping,
lhs_data, rhs_data, out_mapping, out_data);
OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
CallBinaryReduceBcast<XPU, NDim, Idx, DType, LeftTarget,
RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
});
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, int NDim, typename Idx, typename DType>
BackwardBcastGData<NDim, Idx, DType> AllocBackwardBcastGData(
const DLContext& ctx, const BcastInfo& info,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
// GData
BackwardBcastGData<NDim, Idx, DType> gdata;
// dim, shape and stride
gdata.ndim = info.lhs_shape.size();
gdata.lhs_len = utils::Prod(info.lhs_shape);
gdata.rhs_len = utils::Prod(info.rhs_shape);
gdata.out_len = utils::Prod(info.out_shape);
std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
// mappings
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
gdata.data_len = info.data_len;
// data
gdata.lhs_data = static_cast<DType*>(lhs->data);
gdata.rhs_data = static_cast<DType*>(rhs->data);
gdata.out_data = static_cast<DType*>(out->data);
gdata.grad_out_data = static_cast<DType*>(grad_out->data);
if (!aten::IsNullArray(grad_lhs)) {
gdata.grad_lhs_data = static_cast<DType*>(grad_lhs->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs),
static_cast<DType>(0));
}
if (!aten::IsNullArray(grad_rhs)) {
gdata.grad_rhs_data = static_cast<DType*>(grad_rhs->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs),
static_cast<DType>(0));
}
return gdata;
}
template <int XPU>
void BackwardBinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
using runtime::NDArray;
using minigun::Csr;
#ifdef __CUDACC__
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int64_t x_len = utils::ComputeXLength(out);
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
const DLDataType& dtype = out->dtype;
const int bcast_ndim = info.out_shape.size();
const bool req_lhs = !aten::IsNullArray(grad_lhs);
const bool req_rhs = !aten::IsNullArray(grad_rhs);
const auto bits = graph.NumBits();
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
auto gdata = AllocBackwardBcastGData<XPU, NDim, Idx, DType>(
rtcfg.ctx, info,
lhs_mapping, rhs_mapping, out_mapping,
lhs, rhs, out, grad_out,
grad_lhs, grad_rhs);
BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
OP_TARGET_SWITCH(op, lhs_tgt, rhs_tgt, DType, BinaryOp, LeftTarget, RightTarget, {
CallBackwardBinaryReduceBcast<XPU, Mode, NDim, Idx, DType,
LeftTarget, RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
});
});
}
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_IMPL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl_decl.h
* \brief Data structure and function declarations for implementations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#include <dgl/runtime/ndarray.h>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace minigun {
namespace advance {
// forward declaration
struct RuntimeConfig;
} // namespace advance
} // namespace minigun
namespace dgl {
namespace kernel {
// forward declaration
struct BcastInfo;
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BinaryOpReduce in Minigun. */
template <typename Idx, typename DType>
struct GData {
// length along x(feature) dimension
int64_t x_length{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr};
// output data
DType *out_data{nullptr};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
// output id mapping
Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BinaryReduce operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BackwardBinaryReduce in Minigun. */
template <typename Idx, typename DType>
struct BackwardGData {
// length along x(feature) dimension
int64_t x_length{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
DType *grad_out_data{nullptr};
// output data
DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
// output id mapping
Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce operator.
*
* Mode must be one of the enum code in binary_op::BackwardMode.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/backward_binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, int Mode, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
 * \param grad_lhs_data The gradient lhs tensor.
 * \param grad_rhs_data The gradient rhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data structure used by computing BinaryOp with broadcasting in Minigun.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BcastGData {
// actual number of feature dimensions
int ndim{0};
// input feature shape and stride
int64_t lhs_len{0}, rhs_len{0};
int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
// output feature shape and stride
int64_t out_len{0}; // output total feature length (equal to prod(out_shape));
int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
// output data
DType *out_data{nullptr};
// output id mapping
Idx *out_mapping{nullptr};
};
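The kernels that consume BcastGData (see the CPU UDFs further down in this commit, which call Unravel and Ravel) address each operand by unraveling the flat output index against out_shape/out_stride and re-raveling it against that operand's own shape and stride, so a dimension of size 1 contributes nothing to the operand offset. A minimal sketch of that addressing scheme, assuming row-major strides (sketch names only, not the removed utility code):

#include <algorithm>
#include <cstdint>

// Sketch: decompose a flat output index into per-dimension coordinates.
inline void UnravelSketch(int64_t idx, int ndim,
                          const int64_t* shape, const int64_t* stride,
                          int64_t* coord) {
  for (int d = 0; d < ndim; ++d) {
    coord[d] = (idx / stride[d]) % shape[d];
  }
}

// Sketch: recombine coordinates into an operand offset; a broadcast
// dimension (shape[d] == 1) is clamped to coordinate 0 and adds nothing.
inline int64_t RavelSketch(const int64_t* coord, int ndim,
                           const int64_t* shape, const int64_t* stride) {
  int64_t offset = 0;
  for (int d = 0; d < ndim; ++d) {
    offset += std::min(coord[d], shape[d] - 1) * stride[d];
  }
  return offset;
}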
/*!
* \brief Template declaration for BinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data and auxiliary information for backward binary broadcasting op.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* The gradients of the broadcasting dimensions are not reduced. As a result,
 * the grad_lhs and grad_rhs have the same shape as grad_out.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BackwardBcastGData {
// actual number of feature dimensions
int ndim{0};
// input shape and stride
int64_t lhs_len{0}, rhs_len{0}, out_len{0};
int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr}, *out_mapping{nullptr};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
DType *grad_out_data{nullptr};
// output data
DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardBcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
 * \param grad_lhs_data The gradient lhs tensor.
 * \param grad_rhs_data The gradient rhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/common.h
* \brief Kernel common utilities
*/
#ifndef DGL_KERNEL_COMMON_H_
#define DGL_KERNEL_COMMON_H_
#include <dgl/runtime/ndarray.h>
#include <cstdint>
#include "../c_api_common.h"
namespace dgl {
namespace kernel {
#ifdef __CUDACC__
#define DGLDEVICE __device__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __CUDACC__
// Macro for dispatch device flag to template function calls
#ifdef DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else if (val == kDLGPU) { \
Method<kDLGPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#else // DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#endif // DGL_USE_CUDA
// MSVC does not expand __VA_ARGS__ correctly, and needs this expand hack
#define MSVC_EXPAND(x) x
// Macro for dispatch dtype flag to template argument. Currently only
// support float32.
#define DGL_DTYPE_SWITCH(val, DType, ...) \
if (val.code == kDLFloat && val.bits == 32) { \
typedef float DType; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported dtype: " << val; \
}
// Macro for unrolling with data type arguments.
#define GEN_DTYPE(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, float))
// Macro for dispatch index nbits to template argument.
#ifdef __CUDACC__
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#else
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else if (bits == 64) { \
typedef int64_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#endif
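These switch macros are meant to be nested, turning runtime device, dtype, and index-width flags into compile-time template parameters. A hedged usage sketch (FillZeros/FillZerosImpl are made-up names; the real call sites are the *Impl functions elsewhere in this commit, and DLContext/DLDataType come from the headers already included here):

// Hypothetical device-agnostic entry point showing how the macros nest:
// DGL_XPU_SWITCH picks the XPU template argument, then DGL_DTYPE_SWITCH and
// DGL_IDX_TYPE_SWITCH bind DType and Idx inside the chosen instantiation.
template <int XPU>
void FillZerosImpl(DLDataType dtype, int bits, void* data, int64_t n) {
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      DType* ptr = static_cast<DType*>(data);
      for (Idx i = 0; i < static_cast<Idx>(n); ++i) {
        ptr[i] = static_cast<DType>(0);  // CPU-style loop, for illustration only
      }
    });
  });
}

inline void FillZeros(DLContext ctx, DLDataType dtype, int bits, void* data, int64_t n) {
  // Expands to FillZerosImpl<kDLCPU>(...) or, with DGL_USE_CUDA, FillZerosImpl<kDLGPU>(...).
  DGL_XPU_SWITCH(ctx.device_type, FillZerosImpl, dtype, bits, data, n);
}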
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_COMMON_H_
/*!
* Copyright (c) 2019 by Contributors
 * \file kernel/cpu/backward_binary_reduce_impl.h
 * \brief Minigun CPU UDFs for backward binary reduce
*/
#ifndef DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute backward binary reduce.
template <int Mode, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduce {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
const int64_t D = gdata->x_length;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * D * len;
DType* rhsoff = gdata->rhs_data + rid * D * len;
DType* outoff = gdata->out_data + oid * D;
DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len;
DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len;
DType* gradoutoff = gdata->grad_out_data + oid * D;
for (int64_t tx = 0; tx < D; ++tx) {
DType out = Functors::Read(outoff + tx);
DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out);
if (0 == grad_e)
continue;
DType* lhs_base = lhsoff + tx * len;
DType* rhs_base = rhsoff + tx * len;
if (Mode == binary_op::kGradBoth) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
gradlhsoff[tx * len + i] += grad;
}
} else if (Mode == binary_op::kGradLhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
gradlhsoff[tx * len + i] += grad_lhs;
}
} else if (Mode == binary_op::kGradRhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
gradrhsoff[tx * len + i] += grad_rhs;
}
}
}
}
};
// Minigun UDF to compute backward binary reduce with broadcasting.
template <int Mode, int NDim,
typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduceBcast {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;
DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
DType* outoff = gdata->out_data + oid * gdata->out_len;
DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len;
DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len;
DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len;
int64_t tmp[NDim]; // store unraveled idx.
for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
DType out = Functors::Read(outoff + tx);
DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(
lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out);
// (pawelpiotrowicz) Although we can technically add the same condition for
// skipping atomic additions as in BackwardBinaryReduce, doing so made the
// speed 2% slower in GCMC training on MovieLens-1M with 24 OpenMP threads.
// For more details, see https://github.com/dmlc/dgl/pull/1527.
// TODO(BarclayII): Needs further investigation and benchmarking.
DType* lhs_base = lhsoff +
Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len;
DType* rhs_base = rhsoff +
Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len;
if (Mode == binary_op::kGradBoth) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
gradlhsoff[tx * len + i] += grad;
}
} else if (Mode == binary_op::kGradLhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
gradlhsoff[tx * len + i] += grad_lhs;
}
} else if (Mode == binary_op::kGradRhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
gradrhsoff[tx * len + i] += grad_rhs;
}
}
}
}
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
struct BackwardFunctorsTempl {
static inline Idx SelectOut(
Idx src, Idx edge, Idx dst) {
typedef typename OutSelector<Reducer>::Type OutTarget;
return SwitchSrcDst<OutTarget>::Type::Call(src, edge, dst);
}
static inline Idx SelectLeft(
Idx src, Idx edge, Idx dst) {
return LeftSelector::Call(src, edge, dst);
}
static inline Idx SelectRight(
Idx src, Idx edge, Idx dst) {
return RightSelector::Call(src, edge, dst);
}
static inline DType Op(DType* lhs, DType* rhs, int64_t len) {
return BinaryOp::Call(lhs, rhs, len);
}
static inline DType Read(DType* addr) {
return *addr;
}
static inline void Write(DType* addr, DType val) {
Reducer::Call(addr, val);
}
static inline Idx GetId(Idx id, Idx* id_map) {
return *(id_map + id);
}
static inline DType BackwardWrite(DType val, DType accum) {
return Reducer::BackwardCall(val, accum);
}
static inline DType BackwardOpLhs(DType lhs, DType rhs, DType out) {
return BinaryOp::BackwardLhs(lhs, rhs, out);
}
static inline DType BackwardOpRhs(DType lhs, DType rhs, DType out) {
return BinaryOp::BackwardRhs(lhs, rhs, out);
}
};
typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;
} // namespace cpu
// Template implementation of BackwardBinaryReduce operator.
template <int XPU, int Mode, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<Idx, DType>* gdata) {
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into destination buffer to reduce
// contention of atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cpu::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cpu::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, BackwardGData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// The following macro is used to generate explicit specializations of the
// template operator.
#define GEN_BACKWARD_DEFINE(mode, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduce<XPU, \
mode, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardGData<IDX, dtype>* gdata);
// Template implementation of BackwardBinaryReduce with broadcasting operator.
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardBcastGData<NDim, Idx, DType>* gdata) {
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into destination buffer to reduce
// contention of atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cpu::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cpu::BackwardBinaryReduceBcast<Mode, NDim, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
BackwardBcastGData<NDim, Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// The following macro is used to generate explicit specializations of the
// template operator.
#define GEN_BACKWARD_BCAST_DEFINE(mode, ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduceBcast<XPU, \
mode, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardBcastGData<ndim, IDX, dtype>* gdata);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_max.cc
 * \brief CPU kernels for broadcasting binary reduce max
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMax
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_min.cc
 * \brief CPU kernels for broadcasting binary reduce min
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMin
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_none.cc
 * \brief CPU kernels for broadcasting binary reduce none
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceNone
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_prod.cc
 * \brief CPU kernels for broadcasting binary reduce prod
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceProd
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_sum.cc
 * \brief CPU kernels for broadcasting binary reduce sum
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceSum
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.cc
* \brief Binary reduce implementation on CPU.
*/
#include "../binary_reduce_impl.h"
#include "../csr_interface.h"
using dgl::runtime::NDArray;
namespace dgl {
namespace kernel {
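// The definitions below are explicit instantiation definitions: they force the
// templated implementations included from ../binary_reduce_impl.h to be
// compiled for the CPU (kDLCPU) backend in this translation unit.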
template void BinaryReduceImpl<kDLCPU>(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
template void BinaryReduceBcastImpl<kDLCPU>(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
template void BackwardBinaryReduceImpl<kDLCPU>(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
NDArray lhs_data, NDArray rhs_data, NDArray out_data,
NDArray grad_out_data,
NDArray grad_lhs_data, NDArray grad_rhs_data);
template void BackwardBinaryReduceBcastImpl<kDLCPU>(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.h
* \brief Minigun CPU UDFs for binary reduce
*/
#ifndef DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <algorithm>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
const int64_t D = gdata->x_length;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * D * len;
DType* rhsoff = gdata->rhs_data + rid * D * len;
DType* outoff = gdata->out_data + oid * D;
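    // Note: the binary op consumes `len` consecutive values per feature and
    // produces a single scalar (len > 1 typically corresponds to ops that
    // reduce over a small vector, e.g. a dot product), so the input offsets
    // scale by D * len while the output offset scales by D only.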
for (int64_t tx = 0; tx < D; ++tx) {
DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
Functors::Write(outoff + tx, out);
}
}
};
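// Note: minigun's advance routine is expected to invoke CondEdge/ApplyEdge
// once per (src, dst, eid) edge of the CSR it is given, so the structs above
// and below only describe the per-edge computation.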
// Convert a flattened index to a multi-dimensional index (assumes row-major layout).
inline void Unravel(int64_t idx, int ndim,
const int64_t* shape, const int64_t* stride, int64_t* out) {
for (int d = 0; d < ndim; ++d) {
out[d] = (idx / stride[d]) % shape[d];
}
}
// Convert a multi-dimensional index to a flattened index (assumes row-major layout).
inline int64_t Ravel(const int64_t* idx, int ndim,
const int64_t* shape, const int64_t* stride) {
int64_t out = 0;
for (int d = 0; d < ndim; ++d) {
out += std::min(idx[d], shape[d] - 1) * stride[d];
}
return out;
}
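// Worked example (illustrative): with a row-major shape {2, 3} and strides
// {3, 1}, Unravel(4, ...) yields {1, 1}. Raveling that index against a
// broadcast operand of shape {1, 3} gives min(1, 0) * 3 + min(1, 2) * 1 = 1,
// i.e. a size-1 dimension is clamped to index 0, which is how broadcasting is
// realized by Ravel's std::min.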
// Minigun UDF to compute binary reduce with broadcasting.
template <int NDim, typename Idx, typename DType, typename Functors>
struct BinaryReduceBcast {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;  // each feature element spans `len` values
DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
DType* outoff = gdata->out_data + oid * gdata->out_len;
int64_t tmp[NDim]; // store unraveled idx.
for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
DType out = Functors::Op(
lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
len);
Functors::Write(outoff + tx, out);
}
}
};
// Auxiliary functor template used by the UDFs.
template <typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
struct FunctorsTempl {
static inline Idx SelectOut(
Idx src, Idx edge, Idx dst) {
return OutSelector<Reducer>::Type::Call(src, edge, dst);
}
static inline Idx SelectLeft(
Idx src, Idx edge, Idx dst) {
return LeftSelector::Call(src, edge, dst);
}
static inline Idx SelectRight(
Idx src, Idx edge, Idx dst) {
return RightSelector::Call(src, edge, dst);
}
static inline DType Op(DType *lhs, DType *rhs, int64_t len) {
return BinaryOp::Call(lhs, rhs, len);
}
static inline void Write(DType* addr, DType val) {
Reducer::Call(addr, val);
}
static inline Idx GetId(Idx id, Idx* id_map) {
return *(id_map + id);
}
};
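// Illustration (hypothetical instantiation): with LeftSelector choosing the
// source node, RightSelector choosing the edge, BinaryOp = multiply and a sum
// Reducer, the ApplyEdge bodies above effectively compute
// out[target] += src_feat * edge_feat for every edge, i.e. a u_mul_e message
// followed by a sum reduction; OutSelector<Reducer> decides whether `target`
// is the destination node or the edge itself.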
typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;
} // namespace cpu
// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<Idx, DType>* gdata) {
typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cpu::BinaryReduce<Idx, DType, Functors> UDF;
  // Build the CSR view (out-edges) used by the advance kernel.
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping with the edge ids stored in the CSR graph so that the
  // edge data is read and written correctly.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, GData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Template implementation of BinaryReduce broadcasting operator.
template <int XPU, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BcastGData<NDim, Idx, DType>* gdata) {
typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cpu::BinaryReduceBcast<NDim, Idx, DType, Functors> UDF;
  // Build the CSR view (out-edges) used by the advance kernel.
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping with the edge ids stored in the CSR graph so that the
  // edge data is read and written correctly.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
BcastGData<NDim, Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// The following macros generate explicit instantiations of the template
// functions.
#define GEN_DEFINE(dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduce<XPU, IDX, \
dtype, lhs_tgt, rhs_tgt, op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
GData<IDX, dtype>* gdata);
#define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduceBcast<XPU, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BcastGData<ndim, IDX, dtype>* gdata);
#define EVAL(F, ...) MSVC_EXPAND(F(__VA_ARGS__))
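// Example expansion (illustrative; Lhs, Rhs and Op stand for whatever selector
// and operator types the GEN_* argument lists supply): with XPU = kDLCPU,
// IDX = int32_t and REDUCER = ReduceSum, GEN_DEFINE(float, Lhs, Rhs, Op)
// emits
//   template void CallBinaryReduce<kDLCPU, int32_t, float, Lhs, Rhs,
//       Op<float>, ReduceSum<kDLCPU, float>>(
//       const minigun::advance::RuntimeConfig& rtcfg,
//       const CSRWrapper& graph, GData<int32_t, float>* gdata);
// EVAL forces an extra macro-expansion pass (via MSVC_EXPAND) so that the
// nested GEN_* macros in the *.cc files also expand correctly under MSVC's
// __VA_ARGS__ handling.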
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_max.cc
* \brief CPU kernels for binary reduce max
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMax
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_min.cc
* \brief CPU kernels for binary reduce min
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMin
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_none.cc
* \brief CPU kernels for binary reduce none
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceNone
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl