OpenDAS / dgl: Commit c81efdf2 (Unverified)
Authored Sep 06, 2021 by Jinjing Zhou; committed by GitHub on Sep 06, 2021
Parent: 75d793a1

Remove deprecated kernels (#3316)

* remove
* remove
* fix
* remove
* remove
Changes: 46 files in total; this page shows 20 changed files with 0 additions and 1641 deletions.
src/kernel/cpu/binary_reduce_prod.cc             +0 -26
src/kernel/cpu/binary_reduce_sum.cc              +0 -26
src/kernel/cpu/functor.h                         +0 -78
src/kernel/cpu/utils.cc                          +0 -24
src/kernel/csr_interface.h                       +0 -34
src/kernel/cuda/atomic.cuh                       +0 -201
src/kernel/cuda/backward_binary_reduce_impl.cuh  +0 -315
src/kernel/cuda/binary_bcast_reduce_max.cu       +0 -22
src/kernel/cuda/binary_bcast_reduce_min.cu       +0 -21
src/kernel/cuda/binary_bcast_reduce_none.cu      +0 -21
src/kernel/cuda/binary_bcast_reduce_prod.cu      +0 -22
src/kernel/cuda/binary_bcast_reduce_sum.cu       +0 -22
src/kernel/cuda/binary_reduce_impl.cu            +0 -56
src/kernel/cuda/binary_reduce_impl.cuh           +0 -265
src/kernel/cuda/binary_reduce_max.cu             +0 -19
src/kernel/cuda/binary_reduce_min.cu             +0 -19
src/kernel/cuda/binary_reduce_none.cu            +0 -19
src/kernel/cuda/binary_reduce_prod.cu            +0 -20
src/kernel/cuda/binary_reduce_sum.cu             +0 -347
src/kernel/cuda/functor.cuh                      +0 -84
src/kernel/cpu/binary_reduce_prod.cc  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cpu/binary_reduce_prod.cc
 * \brief CPU kernels for binary reduce prod
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceProd
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_reduce_sum.cc  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cpu/binary_reduce_sum.cc
 * \brief CPU kernels for binary reduce sum
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceSum
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/functor.h  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cpu/functor.h
 * \brief Functors for template on CPU
 */
#ifndef DGL_KERNEL_CPU_FUNCTOR_H_
#define DGL_KERNEL_CPU_FUNCTOR_H_

#include <dmlc/omp.h>
#include <algorithm>

#include "../binary_reduce_common.h"

namespace dgl {
namespace kernel {

// Reducer functor specialization
template <typename DType>
struct ReduceSum<kDLCPU, DType> {
  static void Call(DType* addr, DType val) {
    if (0 == val) return;
#pragma omp atomic
    *addr += val;
  }
  static DType BackwardCall(DType val, DType accum) {
    return 1;
  }
};

template <typename DType>
struct ReduceMax<kDLCPU, DType> {
  static void Call(DType* addr, DType val) {
#pragma omp critical
    *addr = std::max(*addr, val);
  }
  static DType BackwardCall(DType val, DType accum) {
    return static_cast<DType>(val == accum);
  }
};

template <typename DType>
struct ReduceMin<kDLCPU, DType> {
  static void Call(DType* addr, DType val) {
#pragma omp critical
    *addr = std::min(*addr, val);
  }
  static DType BackwardCall(DType val, DType accum) {
    return static_cast<DType>(val == accum);
  }
};

template <typename DType>
struct ReduceProd<kDLCPU, DType> {
  static void Call(DType* addr, DType val) {
#pragma omp atomic
    *addr *= val;
  }
  static DType BackwardCall(DType val, DType accum) {
    return accum / val;
  }
};

template <typename DType>
struct ReduceNone<kDLCPU, DType> {
  static void Call(DType* addr, DType val) {
    *addr = val;
  }
  static DType BackwardCall(DType val, DType accum) {
    return 1;
  }
};

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_CPU_FUNCTOR_H_
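In the deleted reducers above, Call folds one value into an accumulator and the paired BackwardCall returns the local gradient weight of that accumulation (1 for sum, an equality mask for max/min, accum/val for prod). As orientation, here is a minimal standalone sketch of the same pattern, assuming only the C++ standard library; ReduceMaxCpu is an illustrative stand-in, not the DGL functor (the real one guards the update with an OpenMP critical section):

#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

template <typename DType>
struct ReduceMaxCpu {
  // Forward: fold one value into the running accumulator.
  static void Call(DType* addr, DType val) { *addr = std::max(*addr, val); }
  // Backward: weight of this value in the reduction (1 only for the argmax).
  static DType BackwardCall(DType val, DType accum) {
    return static_cast<DType>(val == accum);
  }
};

int main() {
  std::vector<float> vals = {0.5f, 2.0f, 1.5f};
  float accum = -std::numeric_limits<float>::infinity();
  for (float v : vals) ReduceMaxCpu<float>::Call(&accum, v);
  for (float v : vals)
    std::printf("val=%.1f grad_weight=%.0f\n", v,
                ReduceMaxCpu<float>::BackwardCall(v, accum));
  return 0;
}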
src/kernel/cpu/utils.cc  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cpu/utils.cc
 * \brief Utility function implementations on CPU
 */
#include "../utils.h"

namespace dgl {
namespace kernel {
namespace utils {

template <int XPU, typename DType>
void Fill(const DLContext& ctx, DType* ptr, size_t length, DType val) {
  for (size_t i = 0; i < length; ++i) {
    *(ptr + i) = val;
  }
}

template void Fill<kDLCPU, float>(
    const DLContext& ctx, float* ptr, size_t length, float val);
template void Fill<kDLCPU, double>(
    const DLContext& ctx, double* ptr, size_t length, double val);

}  // namespace utils
}  // namespace kernel
}  // namespace dgl
src/kernel/csr_interface.h  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/csr_interface.h
 * \brief Kernel common utilities
 */
#ifndef DGL_KERNEL_CSR_INTERFACE_H_
#define DGL_KERNEL_CSR_INTERFACE_H_

#include <dgl/array.h>
#include <dgl/runtime/c_runtime_api.h>

namespace dgl {
namespace kernel {

/*!
 * \brief Wrapper class that unifies ImmutableGraph and Bipartite, which do
 * not share a base class.
 *
 * \note This is an ugly temporary solution, and shall be removed after
 * refactoring ImmutableGraph and Bipartite to use the same data structure.
 */
class CSRWrapper {
 public:
  virtual aten::CSRMatrix GetInCSRMatrix() const = 0;
  virtual aten::CSRMatrix GetOutCSRMatrix() const = 0;
  virtual DGLContext Context() const = 0;
  virtual int NumBits() const = 0;
};

};  // namespace kernel
};  // namespace dgl

#endif  // DGL_KERNEL_CSR_INTERFACE_H_
src/kernel/cuda/atomic.cuh  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/atomic.cuh
 * \brief Atomic functions
 */
#ifndef DGL_KERNEL_CUDA_ATOMIC_H_
#define DGL_KERNEL_CUDA_ATOMIC_H_

#include <cuda_runtime.h>
#if __CUDA_ARCH__ >= 600
#include <cuda_fp16.h>
#endif

namespace dgl {
namespace kernel {
namespace cuda {

// Type trait for selecting code type
template <int Bytes>
struct Code {
};

template <>
struct Code<4> {
  typedef unsigned int Type;
};

template <>
struct Code<8> {
  typedef unsigned long long int Type;
};

// Helper class for converting to/from atomicCAS compatible types.
template <typename T>
struct Cast {
  typedef typename Code<sizeof(T)>::Type Type;
  static __device__ __forceinline__ Type Encode(T val) {
    return static_cast<Type>(val);
  }
  static __device__ __forceinline__ T Decode(Type code) {
    return static_cast<T>(code);
  }
};

template <>
struct Cast<float> {
  typedef Code<sizeof(float)>::Type Type;
  static __device__ __forceinline__ Type Encode(float val) {
    return __float_as_uint(val);
  }
  static __device__ __forceinline__ float Decode(Type code) {
    return __uint_as_float(code);
  }
};

template <>
struct Cast<double> {
  typedef Code<sizeof(double)>::Type Type;
  static __device__ __forceinline__ Type Encode(double val) {
    return __double_as_longlong(val);
  }
  static __device__ __forceinline__ double Decode(Type code) {
    return __longlong_as_double(code);
  }
};

#define DEFINE_ATOMIC(NAME) \
  template <typename T> \
  __device__ __forceinline__ T Atomic##NAME(T* addr, T val) { \
    typedef typename Cast<T>::Type CT; \
    CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
    CT old = *addr_as_ui; \
    CT assumed = old; \
    do { \
      assumed = old; \
      old = atomicCAS(addr_as_ui, assumed, \
          Cast<T>::Encode(OP(val, Cast<T>::Decode(old)))); \
    } while (assumed != old); \
    return Cast<T>::Decode(old); \
  }

#define OP(a, b) max(a, b)
DEFINE_ATOMIC(Max)
#undef OP

#define OP(a, b) min(a, b)
DEFINE_ATOMIC(Min)
#undef OP

#define OP(a, b) a + b
DEFINE_ATOMIC(Add)
#undef OP

#if __CUDA_ARCH__ >= 200
template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
  return atomicAdd(addr, val);
}
#endif  // __CUDA_ARCH__

#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
  return atomicAdd(addr, val);
}
#endif

#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ __half2 AtomicAdd<__half2>(__half2* addr, __half2 val) {
  return atomicAdd(addr, val);
}
#endif  // __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
template <>
__device__ __forceinline__ __half AtomicAdd<__half>(__half* addr, __half val) {
  return atomicAdd(addr, val);
}
#endif  // __CUDA_ARCH__
#endif

#define OP(a, b) a * b
DEFINE_ATOMIC(Mul)
#undef OP

/**
 * \brief Performs an atomic compare-and-swap on 64 bit integers. That is,
 * it reads the word `old` at the memory location `address`, computes
 * `(old == compare ? val : old)`, and stores the result back to memory at
 * the same address.
 *
 * \param address The address to perform the atomic operation on.
 * \param compare The value to compare to.
 * \param val The new value to conditionally store.
 *
 * \return The old value at the address.
 */
inline __device__ int64_t AtomicCAS(
    int64_t* const address, const int64_t compare, const int64_t val) {
  // match the type of "::atomicCAS", so ignore lint warning
  using Type = unsigned long long int;  // NOLINT
  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
  return atomicCAS(reinterpret_cast<Type*>(address),
                   static_cast<Type>(compare),
                   static_cast<Type>(val));
}

/**
 * \brief Performs an atomic compare-and-swap on 32 bit integers. That is,
 * it reads the word `old` at the memory location `address`, computes
 * `(old == compare ? val : old)`, and stores the result back to memory at
 * the same address.
 *
 * \param address The address to perform the atomic operation on.
 * \param compare The value to compare to.
 * \param val The new value to conditionally store.
 *
 * \return The old value at the address.
 */
inline __device__ int32_t AtomicCAS(
    int32_t* const address, const int32_t compare, const int32_t val) {
  // match the type of "::atomicCAS", so ignore lint warning
  using Type = int;  // NOLINT
  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
  return atomicCAS(reinterpret_cast<Type*>(address),
                   static_cast<Type>(compare),
                   static_cast<Type>(val));
}

inline __device__ int64_t AtomicMax(int64_t* const address, const int64_t val) {
  // match the type of "::atomicCAS", so ignore lint warning
  using Type = unsigned long long int;  // NOLINT
  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
  return atomicMax(reinterpret_cast<Type*>(address), static_cast<Type>(val));
}

inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) {
  // match the type of "::atomicCAS", so ignore lint warning
  using Type = int;  // NOLINT
  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
  return atomicMax(reinterpret_cast<Type*>(address), static_cast<Type>(val));
}

}  // namespace cuda
}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_CUDA_ATOMIC_H_
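The DEFINE_ATOMIC macro above builds AtomicMax/Min/Add/Mul from a compare-and-swap retry loop: read the current word, combine it with the incoming value, and try to swap the result in, repeating until no other thread has modified the word in between. A host-side analogue of that loop, sketched with std::atomic instead of CUDA's atomicCAS (the names here are hypothetical, not DGL or CUDA APIs):

#include <algorithm>
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// CAS loop: on failure, compare_exchange_weak reloads `old`, mirroring the
// `do { assumed = old; old = atomicCAS(...); } while (assumed != old)` idiom.
float AtomicMaxCpu(std::atomic<float>* addr, float val) {
  float old = addr->load();
  while (!addr->compare_exchange_weak(old, std::max(old, val))) {
  }
  return old;  // value observed just before the successful swap
}

int main() {
  std::atomic<float> acc{-1e30f};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&acc, t] {
      for (int i = 0; i < 1000; ++i) AtomicMaxCpu(&acc, float(t * 1000 + i));
    });
  for (auto& w : workers) w.join();
  std::printf("max = %g\n", acc.load());  // expect 3999
  return 0;
}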
src/kernel/cuda/backward_binary_reduce_impl.cuh  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/backward_binary_reduce_impl.cuh
 * \brief Minigun CUDA UDFs for backward binary reduce
 */
#ifndef DGL_KERNEL_CUDA_BACKWARD_BINARY_REDUCE_IMPL_CUH_
#define DGL_KERNEL_CUDA_BACKWARD_BINARY_REDUCE_IMPL_CUH_

#include <minigun/minigun.h>

#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.cuh"
#include "../csr_interface.h"

namespace dgl {
namespace kernel {
namespace cuda {

// Minigun UDF to compute backward binary reduce.
template <int Mode, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduce {
  static __device__ __forceinline__ bool CondEdge(
      Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
    return true;
  }
  static __device__ __forceinline__ void ApplyEdge(
      Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len;
    DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len;
    DType* gradoutoff = gdata->grad_out_data + oid * D;
    while (tx < D) {
      DType out = Functors::Read(outoff + tx);
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);

      DType* lhs_base = lhsoff + tx * len;
      DType* rhs_base = rhsoff + tx * len;
      if (Mode == binary_op::kGradBoth) {
#pragma unroll
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          DType grad = grad_lhs + grad_rhs;
          AtomicAdd(gradlhsoff + tx * len + i, grad);
        }
      } else if (Mode == binary_op::kGradLhs) {
#pragma unroll
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          AtomicAdd(gradlhsoff + tx * len + i, grad_lhs);
        }
      } else if (Mode == binary_op::kGradRhs) {
#pragma unroll
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          AtomicAdd(gradrhsoff + tx * len + i, grad_rhs);
        }
      }
      tx += stride_x;
    }
  }
};

// Minigun UDF to compute backward binary reduce with broadcasting.
template <int Mode, int NDim, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduceBcast {
  static __device__ __forceinline__ bool CondEdge(
      Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
    return true;
  }
  static __device__ __forceinline__ void ApplyEdge(
      Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;
    DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
    DType* outoff = gdata->out_data + oid * gdata->out_len;
    DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len;
    DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len;
    DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len;
    while (tx < gdata->out_len) {
      int64_t lhs_add = 0;
      int64_t rhs_add = 0;
      UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride,
          gdata->lhs_shape, gdata->lhs_stride,
          gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add);
      DType out = Functors::Read(outoff + tx);
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(lhsoff + lhs_add * len, rhsoff + rhs_add * len, len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);

      DType* lhs_base = lhsoff + lhs_add * len;
      DType* rhs_base = rhsoff + rhs_add * len;
      if (Mode == binary_op::kGradBoth) {
#pragma unroll
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          DType grad = grad_lhs + grad_rhs;
          AtomicAdd(gradlhsoff + tx * len + i, grad);
        }
      } else if (Mode == binary_op::kGradLhs) {
#pragma unroll
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          AtomicAdd(gradlhsoff + tx * len + i, grad_lhs);
        }
      } else if (Mode == binary_op::kGradRhs) {
#pragma unroll
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          AtomicAdd(gradrhsoff + tx * len + i, grad_rhs);
        }
      }
      tx += stride_x;
    }
  }
};

// Auxiliary template used in UDF.
template <typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
struct BackwardFunctorsTempl {
  static __device__ __forceinline__ Idx SelectOut(Idx src, Idx edge, Idx dst) {
    typedef typename OutSelector<Reducer>::Type OutTarget;
    return SwitchSrcDst<OutTarget>::Type::Call(src, edge, dst);
  }
  static __device__ __forceinline__ Idx SelectLeft(Idx src, Idx edge, Idx dst) {
    return LeftSelector::Call(src, edge, dst);
  }
  static __device__ __forceinline__ Idx SelectRight(Idx src, Idx edge, Idx dst) {
    return RightSelector::Call(src, edge, dst);
  }
  static __device__ __forceinline__ DType Op(DType* lhs, DType* rhs, int64_t len) {
    return BinaryOp::Call(lhs, rhs, len);
  }
  static __device__ __forceinline__ DType Read(DType* addr) {
    return LDGReader<DType>::Call(addr);
  }
  static __device__ __forceinline__ void Write(DType* addr, DType val) {
    Reducer::Call(addr, val);
  }
  static __device__ __forceinline__ Idx GetId(Idx id, Idx* id_map) {
    return LDGReader<Idx>::Call(id_map + id);
  }
  static __device__ __forceinline__ DType BackwardWrite(DType val, DType accum) {
    return Reducer::BackwardCall(val, accum);
  }
  static __device__ __forceinline__ DType BackwardOpLhs(DType lhs, DType rhs, DType out) {
    return BinaryOp::BackwardLhs(lhs, rhs, out);
  }
  static __device__ __forceinline__ DType BackwardOpRhs(DType lhs, DType rhs, DType out) {
    return BinaryOp::BackwardRhs(lhs, rhs, out);
  }
};

typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;

}  // namespace cuda

// Template implementation of BackwardBinaryReduce operator.
template <int XPU, int Mode, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<Idx, DType>* gdata) {
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cuda::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cuda::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge
      && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge
      && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
    BackwardGData<Idx, DType>, UDF>(
        rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}

// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_DEFINE(mode, dtype, lhs_tgt, rhs_tgt, op) \
  template void CallBackwardBinaryReduce<XPU, \
                    mode, IDX, dtype, \
                    lhs_tgt, rhs_tgt, \
                    op<dtype>, REDUCER<XPU, dtype>>( \
      const minigun::advance::RuntimeConfig& rtcfg, \
      const CSRWrapper& graph, \
      BackwardGData<IDX, dtype>* gdata);

// Template implementation of BackwardBinaryReduce with broadcasting operator.
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardBcastGData<NDim, Idx, DType>* gdata) {
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cuda::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cuda::BackwardBinaryReduceBcast<Mode, NDim, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge
      && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge
      && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
    BackwardBcastGData<NDim, Idx, DType>, UDF>(
        rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}

// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_BCAST_DEFINE(mode, ndim, dtype, lhs_tgt, rhs_tgt, op) \
  template void CallBackwardBinaryReduceBcast<XPU, \
                    mode, ndim, IDX, dtype, \
                    lhs_tgt, rhs_tgt, \
                    op<dtype>, REDUCER<XPU, dtype>>( \
      const minigun::advance::RuntimeConfig& rtcfg, \
      const CSRWrapper& graph, \
      BackwardBcastGData<ndim, IDX, dtype>* gdata);

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_CUDA_BACKWARD_BINARY_REDUCE_IMPL_CUH_
src/kernel/cuda/binary_bcast_reduce_max.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_bcast_reduce_max.cu
 * \brief CUDA kernels for broadcasting binary reduce max
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {
namespace cuda {
}  // namespace cuda

#define REDUCER ReduceMax
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_bcast_reduce_min.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_bcast_reduce_min.cu
 * \brief CUDA kernels for broadcasting binary reduce min
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {
namespace cuda {
}  // namespace cuda

#define REDUCER ReduceMin
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_bcast_reduce_none.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_bcast_reduce_none.cu
 * \brief CUDA kernels for broadcasting binary reduce none
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {
namespace cuda {
}  // namespace cuda

#define REDUCER ReduceNone
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_bcast_reduce_prod.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_bcast_reduce_prod.cu
 * \brief CUDA kernels for broadcasting binary reduce prod
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {
namespace cuda {
}  // namespace cuda

#define REDUCER ReduceProd
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_bcast_reduce_sum.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_bcast_reduce_sum.cu
 * \brief CUDA kernels for broadcasting binary reduce sum
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {
namespace cuda {
}  // namespace cuda

#define REDUCER ReduceSum
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_reduce_impl.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_impl.cu
 * \brief Binary reduce implementation on cuda.
 */
#include "../binary_reduce_impl.h"
#include "../csr_interface.h"

using dgl::runtime::NDArray;

namespace dgl {
namespace kernel {

template void BinaryReduceImpl<kDLGPU>(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);

template void BinaryReduceBcastImpl<kDLGPU>(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);

template void BackwardBinaryReduceImpl<kDLGPU>(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_lhs_data, NDArray grad_rhs_data);

template void BackwardBinaryReduceBcastImpl<kDLGPU>(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out,
    runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs);

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_reduce_impl.cuh  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_impl.cuh
 * \brief Minigun CUDA UDFs for binary reduce
 */
#ifndef DGL_KERNEL_CUDA_BINARY_REDUCE_IMPL_CUH_
#define DGL_KERNEL_CUDA_BINARY_REDUCE_IMPL_CUH_

#include <minigun/minigun.h>

#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.cuh"
#include "../csr_interface.h"

namespace dgl {
namespace kernel {
namespace cuda {

// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
  static __device__ __forceinline__ bool CondEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    return true;
  }
  static __device__ __forceinline__ void ApplyEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
    const int64_t stride_x = blockDim.x * gridDim.x;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    while (tx < D) {
      DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      Functors::Write(outoff + tx, out);
      tx += stride_x;
    }
  }
};

/*
 * This function does the following:
 * 1. Convert flattened index to multi-dimension index
 *    according to output shape (assume row-major).
 * 2. Convert multi-dimension index to flattened index for lhs.
 * 3. Convert multi-dimension index to flattened index for rhs.
 */
__device__ __forceinline__ void UnravelRavel(
    const int64_t idx, const int ndim,
    const int64_t* out_shape, const int64_t* out_stride,
    const int64_t* lhs_shape, const int64_t* lhs_stride,
    const int64_t* rhs_shape, const int64_t* rhs_stride,
    int64_t* lhs_out, int64_t* rhs_out) {
  if (out_stride[0] == lhs_stride[0]) {
#pragma unroll
    for (int d = 0; d < ndim; ++d) {
      int64_t o_sh = out_shape[d];
      int64_t o_st = out_stride[d];
      int64_t rhs_sh = rhs_shape[d];
      int64_t rhs_st = rhs_stride[d];
      int64_t i = (idx / o_st) % o_sh;
      /*
       * Simplified from rhs_out += min(i, rhs_sh - 1) * rhs_st;
       * rhs_sh is either o_sh or 1.
       */
      if (rhs_sh > i) {
        *rhs_out += i * rhs_st;
      }
    }
    *lhs_out = idx;
  } else {
#pragma unroll
    for (int d = 0; d < ndim; ++d) {
      int64_t o_sh = out_shape[d];
      int64_t o_st = out_stride[d];
      int64_t lhs_sh = lhs_shape[d];
      int64_t lhs_st = lhs_stride[d];
      int64_t i = (idx / o_st) % o_sh;
      /*
       * Simplified from lhs_out += min(i, lhs_sh - 1) * lhs_st;
       * lhs_sh is either o_sh or 1.
       */
      if (lhs_sh > i) {
        *lhs_out += i * lhs_st;
      }
    }
    *rhs_out = idx;
  }
}

// Minigun UDF to compute binary reduce with broadcasting.
template <int NDim, typename Idx, typename DType, typename Functors>
struct BinaryReduceBcast {
  static __device__ __forceinline__ bool CondEdge(
      Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
    return true;
  }
  static __device__ __forceinline__ void ApplyEdge(
      Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
    const int64_t stride_x = blockDim.x * gridDim.x;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;  // data with len size
    DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
    DType* outoff = gdata->out_data + oid * gdata->out_len;
    while (tx < gdata->out_len) {
      int64_t lhs_add = 0;
      int64_t rhs_add = 0;
      UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride,
          gdata->lhs_shape, gdata->lhs_stride,
          gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add);
      DType out = Functors::Op(lhsoff + lhs_add * len, rhsoff + rhs_add * len, len);
      Functors::Write(outoff + tx, out);
      tx += stride_x;
    }
  }
};

// Auxiliary template used in UDF.
template <typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
struct FunctorsTempl {
  static __device__ __forceinline__ Idx SelectOut(Idx src, Idx edge, Idx dst) {
    return OutSelector<Reducer>::Type::Call(src, edge, dst);
  }
  static __device__ __forceinline__ Idx SelectLeft(Idx src, Idx edge, Idx dst) {
    return LeftSelector::Call(src, edge, dst);
  }
  static __device__ __forceinline__ Idx SelectRight(Idx src, Idx edge, Idx dst) {
    return RightSelector::Call(src, edge, dst);
  }
  static __device__ __forceinline__ DType Op(DType* lhs, DType* rhs, int64_t len) {
    return BinaryOp::Call(lhs, rhs, len);
  }
  static __device__ __forceinline__ void Write(DType* addr, DType val) {
    Reducer::Call(addr, val);
  }
  static __device__ __forceinline__ Idx GetId(Idx id, Idx* id_map) {
    return LDGReader<Idx>::Call(id_map + id);
  }
};

typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;

}  // namespace cuda

// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg,
                      const CSRWrapper& graph,
                      GData<Idx, DType>* gdata) {
  typedef cuda::FunctorsTempl<Idx, DType, LeftSelector,
          RightSelector, BinaryOp, Reducer> Functors;
  typedef cuda::BinaryReduce<Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge
      && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge
      && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
    GData<Idx, DType>, UDF>(
        rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}

// Template implementation of BinaryReduce broadcasting operator.
template <int XPU, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BcastGData<NDim, Idx, DType>* gdata) {
  typedef cuda::FunctorsTempl<Idx, DType, LeftSelector,
          RightSelector, BinaryOp, Reducer> Functors;
  typedef cuda::BinaryReduceBcast<NDim, Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge
      && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge
      && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
    BcastGData<NDim, Idx, DType>, UDF>(
        rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}

// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_DEFINE(dtype, lhs_tgt, rhs_tgt, op) \
  template void CallBinaryReduce<XPU, IDX, \
        dtype, lhs_tgt, rhs_tgt, op<dtype>, REDUCER<XPU, dtype>>( \
      const minigun::advance::RuntimeConfig& rtcfg, \
      const CSRWrapper& graph, \
      GData<IDX, dtype>* gdata);

#define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \
  template void CallBinaryReduceBcast<XPU, ndim, IDX, dtype, \
                    lhs_tgt, rhs_tgt, \
                    op<dtype>, REDUCER<XPU, dtype>>( \
      const minigun::advance::RuntimeConfig& rtcfg, \
      const CSRWrapper& graph, \
      BcastGData<ndim, IDX, dtype>* gdata);

#define EVAL(F, ...) MSVC_EXPAND(F(__VA_ARGS__))

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_CUDA_BINARY_REDUCE_IMPL_CUH_
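UnravelRavel above is the heart of the broadcasting path: a flat output index is decoded one dimension at a time, and an operand whose size is 1 along a dimension simply ignores that coordinate, so both operands get a valid flat offset of their own. A small host-side sketch of the same index arithmetic, with hypothetical helper names rather than DGL code:

#include <cstdint>
#include <cstdio>
#include <vector>

// Map a flat index into the output tensor to a flat offset into one operand,
// treating size-1 (broadcast) dimensions as contributing nothing.
int64_t RavelForOperand(int64_t idx, const std::vector<int64_t>& out_shape,
                        const std::vector<int64_t>& out_stride,
                        const std::vector<int64_t>& op_shape,
                        const std::vector<int64_t>& op_stride) {
  int64_t off = 0;
  for (size_t d = 0; d < out_shape.size(); ++d) {
    int64_t i = (idx / out_stride[d]) % out_shape[d];  // coordinate along dim d
    if (op_shape[d] > i)   // op_shape[d] is either out_shape[d] or 1
      off += i * op_stride[d];
  }
  return off;
}

int main() {
  // Output shape (2, 3); rhs is broadcast along dim 0 with shape (1, 3).
  std::vector<int64_t> out_shape{2, 3}, out_stride{3, 1};
  std::vector<int64_t> rhs_shape{1, 3}, rhs_stride{3, 1};
  for (int64_t idx = 0; idx < 6; ++idx)
    std::printf("out %lld -> rhs %lld\n", (long long)idx,
                (long long)RavelForOperand(idx, out_shape, out_stride,
                                           rhs_shape, rhs_stride));
  return 0;  // prints rhs offsets 0,1,2,0,1,2
}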
src/kernel/cuda/binary_reduce_max.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_max.cu
 * \brief CUDA kernels for binary reduce max
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {

#define REDUCER ReduceMax
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_reduce_min.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_min.cu
 * \brief CUDA kernels for binary reduce min
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {

#define REDUCER ReduceMin
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_reduce_none.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_none.cu
 * \brief CUDA kernels for binary reduce none
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {

#define REDUCER ReduceNone
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_reduce_prod.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_prod.cu
 * \brief CUDA kernels for binary reduce prod
 */
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"

namespace dgl {
namespace kernel {

#define REDUCER ReduceProd
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)

}  // namespace kernel
}  // namespace dgl
src/kernel/cuda/binary_reduce_sum.cu  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/binary_reduce_sum.cu
 * \brief CUDA kernels for binary reduce sum
 */
#include <dgl/runtime/device_api.h>

#include "../../runtime/cuda/cuda_common.h"
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
#include "../utils.h"
#include "../csr_interface.h"

using minigun::advance::RuntimeConfig;

namespace dgl {
namespace kernel {
namespace cuda {

// specialization for cusparse

#if CUDART_VERSION < 11000
template <typename DType>
cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
    const DType* alpha, const cusparseMatDescr_t descrA,
    const DType* csrValA, const int* csrRowPtrA, const int* csrColIndA,
    const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
  return CUSPARSE_STATUS_EXECUTION_FAILED;
}

template <>
cusparseStatus_t Xcsrmm2<float>(cusparseHandle_t handle, cusparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
    const float* alpha, const cusparseMatDescr_t descrA,
    const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
    const float* B, int ldb, const float* beta, float* C, int ldc) {
  return cusparseScsrmm2(handle, transA, transB, m, n, k, nnz,
      alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
}

template <>
cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t transA,
    cusparseOperation_t transB, int m, int n, int k, int nnz,
    const double* alpha, const cusparseMatDescr_t descrA,
    const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
    const double* B, int ldb, const double* beta, double* C, int ldc) {
  return cusparseDcsrmm2(handle, transA, transB, m, n, k, nnz,
      alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
}
#endif

template <typename DType>
cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
    cublasOperation_t transb, int m, int n,
    const DType* alpha, const DType* A, int lda,
    const DType* beta, const DType* B, int ldb,
    DType* C, int ldc) {
  LOG(INFO) << "Not supported dtype";
  return CUBLAS_STATUS_EXECUTION_FAILED;
}

template <>
cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
    cublasOperation_t transb, int m, int n,
    const float* alpha, const float* A, int lda,
    const float* beta, const float* B, int ldb,
    float* C, int ldc) {
  return cublasSgeam(handle, transa, transb, m, n,
      alpha, A, lda, beta, B, ldb, C, ldc);
}

template <>
cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
    cublasOperation_t transb, int m, int n,
    const double* alpha, const double* A, int lda,
    const double* beta, const double* B, int ldb,
    double* C, int ldc) {
  return cublasDgeam(handle, transa, transb, m, n,
      alpha, A, lda, beta, B, ldb, C, ldc);
}

template <typename DType>
void CusparseCsrmm2(const RuntimeConfig& rtcfg, const aten::CSRMatrix& csr,
                    const DType* B_data, DType* C_data, int x_length) {
  // We use csrmm2 to perform the following operation:
  // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix for node
  // feature tensor. However, since cusparse only supports column-major, while our tensor
  // is stored in row-major, the actual computation is:
  // C = trans(A x trans(B)).
  // Currently, we use cublasXgeam to implement transposition and allocate intermediate
  // workspace memory for this.
  const int m = csr.num_rows;
  const int n = x_length;
  const int k = csr.num_cols;
  const int nnz = csr.indices->shape[0];
  const DType alpha = 1.0;
  const DType beta = 0.0;
  // device
  auto device = runtime::DeviceAPI::Get(rtcfg.ctx);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, rtcfg.stream));
  // allocate matrix for temporary transposed output
  DType* trans_out = static_cast<DType*>(
      device->AllocWorkspace(rtcfg.ctx, m * n * sizeof(DType)));
  // all one data array
  DType* valptr = static_cast<DType*>(
      device->AllocWorkspace(rtcfg.ctx, nnz * sizeof(DType)));
  utils::Fill<kDLGPU>(rtcfg.ctx, valptr, nnz, static_cast<DType>(1.));
#if CUDART_VERSION >= 11000
  auto ctx = rtcfg.ctx;
  cusparseSpMatDescr_t matA;
  cusparseDnMatDescr_t matB, matC;
  constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F : CUDA_R_64F;
  CUSPARSE_CALL(cusparseCreateCsr(&matA, m, k, nnz,
      static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data),
      const_cast<DType*>(valptr),
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
  CUSPARSE_CALL(cusparseCreateDnMat(&matB, n, k, n,
      const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_COL));
  CUSPARSE_CALL(cusparseCreateDnMat(&matC, m, n, m,
      trans_out, cuda_dtype, CUSPARSE_ORDER_COL));
  auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  auto transB = CUSPARSE_OPERATION_TRANSPOSE;
  size_t workspace_size;
  CUSPARSE_CALL(cusparseSpMM_bufferSize(
      thr_entry->cusparse_handle, transA, transB,
      &alpha, matA, matB, &beta, matC,
      cuda_dtype, CUSPARSE_CSRMM_ALG1, &workspace_size));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  CUSPARSE_CALL(cusparseSpMM(
      thr_entry->cusparse_handle, transA, transB,
      &alpha, matA, matB, &beta, matC,
      cuda_dtype, CUSPARSE_CSRMM_ALG1, workspace));
  device->FreeWorkspace(ctx, workspace);
  CUSPARSE_CALL(cusparseDestroySpMat(matA));
  CUSPARSE_CALL(cusparseDestroyDnMat(matB));
  CUSPARSE_CALL(cusparseDestroyDnMat(matC));
#else
  cusparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(Xcsrmm2<DType>(thr_entry->cusparse_handle,
      CUSPARSE_OPERATION_NON_TRANSPOSE,
      CUSPARSE_OPERATION_TRANSPOSE,
      m, n, k, nnz, &alpha, descr, valptr,
      static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data),
      B_data, n, &beta, trans_out, m));
  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
#endif
  device->FreeWorkspace(rtcfg.ctx, valptr);
  // transpose the output matrix
  if (!thr_entry->cublas_handle) {
    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
  }
  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, rtcfg.stream));
  CUBLAS_CALL(Xgeam<DType>(thr_entry->cublas_handle,
      CUBLAS_OP_T, CUBLAS_OP_N, n, m,
      &alpha, trans_out, m,
      &beta, nullptr, n,
      C_data, n));
  device->FreeWorkspace(rtcfg.ctx, trans_out);
}

// forward
template <typename DType>
void FallbackCallBinaryReduce(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<int32_t, DType>* gdata) {
  constexpr int XPU = kDLGPU;
  typedef int32_t Idx;
  typedef SelectSrc LeftSelector;
  typedef SelectNone RightSelector;
  typedef BinaryUseLhs<DType> BinaryOp;
  typedef ReduceSum<kDLGPU, DType> Reducer;
  typedef cuda::FunctorsTempl<Idx, DType, LeftSelector,
          RightSelector, BinaryOp, Reducer> Functors;
  typedef cuda::BinaryReduce<Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge
      && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge
      && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
    GData<Idx, DType>, UDF>(
        rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}

template <typename DType>
void FallbackCallBackwardBinaryReduce(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<int32_t, DType>* gdata) {
  constexpr int XPU = kDLGPU;
  constexpr int Mode = binary_op::kGradLhs;
  typedef int32_t Idx;
  typedef SelectSrc LeftSelector;
  typedef SelectNone RightSelector;
  typedef BinaryUseLhs<DType> BinaryOp;
  typedef ReduceSum<kDLGPU, DType> Reducer;
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cuda::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cuda::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge
      && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge
      && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
    BackwardGData<Idx, DType>, UDF>(
        rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}

}  // namespace cuda

template <>
void CallBinaryReduce<kDLGPU, int32_t, float, SelectSrc, SelectNone,
                      BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<int32_t, float>* gdata) {
  if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
    cuda::FallbackCallBinaryReduce<float>(rtcfg, graph, gdata);
  } else {
    // cusparse uses rev csr for csrmm
    auto csr = graph.GetInCSRMatrix();
    cuda::CusparseCsrmm2(rtcfg, csr, gdata->lhs_data, gdata->out_data,
                         gdata->x_length);
  }
}

template <>
void CallBinaryReduce<kDLGPU, int32_t, double, SelectSrc, SelectNone,
                      BinaryUseLhs<double>, ReduceSum<kDLGPU, double>>(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<int32_t, double>* gdata) {
  if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
    cuda::FallbackCallBinaryReduce<double>(rtcfg, graph, gdata);
  } else {
    // cusparse uses rev csr for csrmm
    auto csr = graph.GetInCSRMatrix();
    cuda::CusparseCsrmm2(rtcfg, csr, gdata->lhs_data, gdata->out_data,
                         gdata->x_length);
  }
}

// backward
template <>
void CallBackwardBinaryReduce<kDLGPU, binary_op::kGradLhs, int32_t, float,
                              SelectSrc, SelectNone,
                              BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<int32_t, float>* gdata) {
  if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
    cuda::FallbackCallBackwardBinaryReduce<float>(rtcfg, graph, gdata);
  } else {
    auto csr = graph.GetOutCSRMatrix();
    cuda::CusparseCsrmm2(rtcfg, csr, gdata->grad_out_data, gdata->grad_lhs_data,
                         gdata->x_length);
  }
}

template <>
void CallBackwardBinaryReduce<kDLGPU, binary_op::kGradLhs, int32_t, double,
                              SelectSrc, SelectNone,
                              BinaryUseLhs<double>, ReduceSum<kDLGPU, double>>(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<int32_t, double>* gdata) {
  if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
    cuda::FallbackCallBackwardBinaryReduce<double>(rtcfg, graph, gdata);
  } else {
    auto csr = graph.GetOutCSRMatrix();
    cuda::CusparseCsrmm2(rtcfg, csr, gdata->grad_out_data, gdata->grad_lhs_data,
                         gdata->x_length);
  }
}

// generate definitions
#define REDUCER ReduceSum
#define XPU kDLGPU
#define IDX int32_t

EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);

}  // namespace kernel
}  // namespace dgl
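The layout comment inside CusparseCsrmm2 is the key to that routine: legacy csrmm2 only accepts column-major dense matrices while DGL's feature tensors are row-major, so the kernel evaluates trans(A x trans(B)) into a column-major scratch buffer and transposes the result back with cublasXgeam. The identity it leans on is simply that a row-major M x N buffer read with column-major indexing is its N x M transpose, as in this small hypothetical sketch (plain C++, no CUDA assumed):

#include <cstdio>

int main() {
  const int M = 2, N = 3;
  // Row-major 2x3 matrix.
  double row_major[M * N] = {1, 2, 3,
                             4, 5, 6};
  // Interpreting the same buffer as a column-major 3x2 matrix: its (i, j)
  // entry is row_major[j * N + i], i.e. the original (j, i) entry, which is
  // exactly the transpose.
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j) std::printf("%4.0f", row_major[j * N + i]);
    std::printf("\n");
  }
  return 0;
}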
src/kernel/cuda/functor.cuh  (deleted, file mode 100644 → 0)

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/cuda/functor.cuh
 * \brief Functors for template on CUDA
 */
#ifndef DGL_KERNEL_CUDA_FUNCTOR_CUH_
#define DGL_KERNEL_CUDA_FUNCTOR_CUH_

#include "../binary_reduce_common.h"
#include "./atomic.cuh"

namespace dgl {
namespace kernel {
namespace cuda {

// Cache load from global memory
template <typename DType>
struct LDGReader {
  static __device__ __forceinline__ DType Call(DType* addr) {
#if __CUDA_ARCH__ >= 350
    return __ldg(addr);
#else
    return *addr;
#endif
  }
};

}  // namespace cuda

// Reducer functor specialization
template <typename DType>
struct ReduceSum<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicAdd(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return 1;
  }
};

template <typename DType>
struct ReduceMax<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicMax(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return static_cast<DType>(val == accum);
  }
};

template <typename DType>
struct ReduceMin<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicMin(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return static_cast<DType>(val == accum);
  }
};

template <typename DType>
struct ReduceProd<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicMul(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return accum / val;
  }
};

template <typename DType>
struct ReduceNone<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    *addr = val;
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return 1;
  }
};

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_CUDA_FUNCTOR_CUH_