OpenDAS / dgl / Commits / c81efdf2

Commit c81efdf2 (unverified), parent 75d793a1
Authored Sep 06, 2021 by Jinjing Zhou; committed via GitHub on Sep 06, 2021

Remove deprecated kernels (#3316)

* remove
* remove
* fix
* remove
* remove
Changes: 46 files in total. Showing 20 changed files with 83 additions and 3173 deletions (+83, -3173).
File                                          Additions  Deletions
CMakeLists.txt                                       +0         -2
src/array/cuda/atomic.cuh                           +81         -0
src/array/cuda/rowwise_sampling.cu                   +2         -2
src/kernel/binary_reduce.cc                          +0       -612
src/kernel/binary_reduce.h                           +0       -253
src/kernel/binary_reduce_common.h                    +0       -541
src/kernel/binary_reduce_impl.h                      +0       -440
src/kernel/binary_reduce_impl_decl.h                 +0       -414
src/kernel/common.h                                  +0        -86
src/kernel/cpu/backward_binary_reduce_impl.h         +0       -318
src/kernel/cpu/binary_bcast_reduce_max.cc            +0        -28
src/kernel/cpu/binary_bcast_reduce_min.cc            +0        -28
src/kernel/cpu/binary_bcast_reduce_none.cc           +0        -28
src/kernel/cpu/binary_bcast_reduce_prod.cc           +0        -28
src/kernel/cpu/binary_bcast_reduce_sum.cc            +0        -28
src/kernel/cpu/binary_reduce_impl.cc                 +0        -56
src/kernel/cpu/binary_reduce_impl.h                  +0       -231
src/kernel/cpu/binary_reduce_max.cc                  +0        -26
src/kernel/cpu/binary_reduce_min.cc                  +0        -26
src/kernel/cpu/binary_reduce_none.cc                 +0        -26
CMakeLists.txt @ c81efdf2

@@ -145,8 +145,6 @@ file(GLOB DGL_SRC
   src/array/cpu/*.cc
   src/random/*.cc
   src/random/cpu/*.cc
-  src/kernel/*.cc
-  src/kernel/cpu/*.cc
   src/runtime/*.cc
   src/geometry/*.cc
   src/geometry/cpu/*.cc
src/array/cuda/atomic.cuh @ c81efdf2

@@ -10,6 +10,9 @@
 #include <cassert>
 #include "fp16.cuh"
+#if __CUDA_ARCH__ >= 600
+#include <cuda_fp16.h>
+#endif

 namespace dgl {
 namespace aten {
@@ -133,6 +136,84 @@ DEFINE_ATOMIC_HALF(Min)
 DEFINE_ATOMIC(Add)
 #undef OP

+/**
+ * \brief Performs an atomic compare-and-swap on 64 bit integers. That is,
+ * it reads the word `old` at the memory location `address`, computes
+ * `(old == compare ? val : old)`, and stores the result back to memory at
+ * the same address.
+ *
+ * \param address The address to perform the atomic operation on.
+ * \param compare The value to compare to.
+ * \param val The new value to conditionally store.
+ *
+ * \return The old value at the address.
+ */
+inline __device__ int64_t AtomicCAS(
+    int64_t* const address, const int64_t compare, const int64_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = unsigned long long int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicCAS(reinterpret_cast<Type*>(address),
+                   static_cast<Type>(compare),
+                   static_cast<Type>(val));
+}
+
+/**
+ * \brief Performs an atomic compare-and-swap on 32 bit integers. That is,
+ * it reads the word `old` at the memory location `address`, computes
+ * `(old == compare ? val : old)`, and stores the result back to memory at
+ * the same address.
+ *
+ * \param address The address to perform the atomic operation on.
+ * \param compare The value to compare to.
+ * \param val The new value to conditionally store.
+ *
+ * \return The old value at the address.
+ */
+inline __device__ int32_t AtomicCAS(
+    int32_t* const address, const int32_t compare, const int32_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicCAS(reinterpret_cast<Type*>(address),
+                   static_cast<Type>(compare),
+                   static_cast<Type>(val));
+}
+
+inline __device__ int64_t AtomicMax(
+    int64_t* const address, const int64_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = unsigned long long int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicMax(reinterpret_cast<Type*>(address), static_cast<Type>(val));
+}
+
+inline __device__ int32_t AtomicMax(
+    int32_t* const address, const int32_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicMax(reinterpret_cast<Type*>(address), static_cast<Type>(val));
+}
+
 template <>
 __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
 #if __CUDA_ARCH__ >= 200
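The overloads above extend the header's Atomic* helpers to 32/64-bit integers by reinterpreting the address as the type that CUDA's ::atomicCAS / ::atomicMax accept. A minimal usage sketch, assuming the helpers live in dgl::aten::cuda as the using-directive in rowwise_sampling.cu below suggests (the kernel name and launch shape are illustrative, not part of this commit):

// bucket_max_example.cu -- illustrative only, not repository code
#include <cstdint>
#include "../../array/cuda/atomic.cuh"

// Records, per bucket, the largest 64-bit value observed by any thread.
__global__ void BucketMaxKernel(const int64_t* values, const int* bucket_of,
                                int64_t* bucket_max, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {
    // Resolves to the int64_t AtomicMax overload added in this commit.
    dgl::aten::cuda::AtomicMax(bucket_max + bucket_of[tid], values[tid]);
  }
}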
src/array/cuda/rowwise_sampling.cu @ c81efdf2

@@ -10,10 +10,10 @@
 #include <numeric>

 #include "./dgl_cub.cuh"
-#include "../../kernel/cuda/atomic.cuh"
+#include "../../array/cuda/atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"

-using namespace dgl::kernel::cuda;
+using namespace dgl::aten::cuda;

 namespace dgl {
 namespace aten {
src/kernel/binary_reduce.cc (deleted, 100644 → 0) @ 75d793a1

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/binary_reduce.cc
 * \brief Binary reduce C APIs and definitions.
 */
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>

#include "./binary_reduce.h"
#include "./common.h"
#include "./binary_reduce_impl_decl.h"
#include "./utils.h"
#include "../c_api_common.h"
#include "../array/check.h"
#include "../graph/unit_graph.h"
#include "./csr_interface.h"

using namespace dgl::runtime;

namespace dgl {
namespace kernel {
namespace {

// convert ndarray shape to string
std::string ShapeString(NDArray nd) {
  std::ostringstream oss;
  oss << "(";
  for (int i = 1; i < nd->ndim; ++i) {
    oss << nd->shape[i];
    if (i != nd->ndim - 1) {
      oss << ",";
    }
  }
  oss << ")";
  return oss.str();
}

// compute stride vector given shape; assume row-major storage
std::vector<int64_t> ComputeStride(const std::vector<int64_t>& shape) {
  std::vector<int64_t> ret(shape.size(), 1);
  for (int i = shape.size() - 2; i >= 0; --i) {
    ret[i] = ret[i + 1] * shape[i + 1];
  }
  return ret;
}

// Return true if the feature shapes of the two ndarrays can be
// computed element-wisely *without* broadcasting.
// Examples:
//
// valid:
//   lhs.shape = (N, D1, D2)
//   rhs.shape = (M, D1, D2)  # the first dimension could be different
//
// invalid:
//   lhs.shape = (N, D1, D2)
//   rhs.shape = (M, D1)
bool IsValidBinaryOpShape(NDArray lhs, NDArray rhs) {
  if (lhs->ndim != rhs->ndim) {
    return false;
  }
  for (int i = 1; i < lhs->ndim; ++i) {
    if (lhs->shape[i] != rhs->shape[i]) {
      return false;
    }
  }
  return true;
}

// Return true if broadcasting might be required to compute the element-wise
// operation between the features of the two ndarrays.
// The broadcasting semantic strictly follows numpy.
// Note that the function could return true for invalid element-wise shapes
// (e.g. lhs.shape = (N, 3), rhs.shape = (N, 5)). This is fine since
// ``CalcBcastInfo`` will handle that.
bool HasBcast(NDArray lhs, NDArray rhs) {
  if (lhs->ndim != rhs->ndim) {
    return true;
  }
  for (int i = 1; i < lhs->ndim; ++i) {
    if (lhs->shape[i] != rhs->shape[i]) {
      return true;
    }
  }
  return false;
}

// Compute auxiliary information of broadcasting dimensions.
// The function preprocesses the feature shapes so that:
//  - The first dimension (for graph) is removed.
//  - Feature dimensions are aligned.
//    e.g. (4,) and (3, 4) become (1, 4) and (3, 4)
//  - Continuous non-broadcasting dimenions are flattened to reduce number of
//    integers used to represent the feature shape.
//    e.g. (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9)
//
// See also: BcastInfo (kernel/binary_reduce.h)
BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) {
  BcastInfo ret;
  const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1;
  int64_t accum = 0;
  int j = 0;
  // for dot operation: vector [dot] vector
  // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector)
  // out_shape[ndim-1] = 1
  if (op == binary_op::kDot) {
    // get size of vector
    ret.data_len = lhs->shape[lhs->ndim - 1];
    // skip vector size dim
    ++j;
    ret.real_out_shape.push_back(ret.data_len);
  } else {  // op != binary_op::kDot
    ret.data_len = 1;
  }
  for (; j < max_ndim; ++j) {
    const int dl = (lhs->ndim - 1 - j < 1) ? 1 : lhs->shape[lhs->ndim - 1 - j];
    const int dr = (rhs->ndim - 1 - j < 1) ? 1 : rhs->shape[rhs->ndim - 1 - j];
    if (dl != dr) {
      if (dl != 1 && dr != 1) {
        LOG(FATAL) << "Invalid broadcasting between feature shapes "
                   << ShapeString(lhs) << " and " << ShapeString(rhs);
      }
      if (accum != 0) {
        ret.lhs_shape.push_back(accum);
        ret.rhs_shape.push_back(accum);
        ret.out_shape.push_back(accum);
        accum = 0;
      }
      ret.lhs_shape.push_back(dl);
      ret.rhs_shape.push_back(dr);
      ret.out_shape.push_back(std::max(dl, dr));
    } else {
      if (accum == 0) {
        accum = dl;
      } else {
        accum *= dl;
      }
    }
    ret.real_out_shape.push_back(std::max(dl, dr));
  }
  if (accum != 0) {
    ret.lhs_shape.push_back(accum);
    ret.rhs_shape.push_back(accum);
    ret.out_shape.push_back(accum);
    accum = 0;
  }
  std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end());
  std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end());
  std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end());
  std::reverse(ret.out_shape.begin(), ret.out_shape.end());
  // stride
  ret.lhs_stride = ComputeStride(ret.lhs_shape);
  ret.rhs_stride = ComputeStride(ret.rhs_shape);
  ret.out_stride = ComputeStride(ret.out_shape);
  return ret;
}

// Function to convert an idarray to string
std::string IdArrayToStr(IdArray arr) {
  arr = arr.CopyTo(DLContext{kDLCPU, 0});
  int64_t len = arr->shape[0];
  std::ostringstream oss;
  oss << "(" << len << ")[";
  if (arr->dtype.bits == 32) {
    int32_t* data = static_cast<int32_t*>(arr->data);
    for (int64_t i = 0; i < len; ++i) {
      oss << data[i] << " ";
    }
  } else {
    int64_t* data = static_cast<int64_t*>(arr->data);
    for (int64_t i = 0; i < len; ++i) {
      oss << data[i] << " ";
    }
  }
  oss << "]";
  return oss.str();
}

// Check whether the given arguments use the same number of bits.
inline void CheckIdArray(
    const uint8_t bits,
    const std::vector<NDArray>& arrays,
    const std::vector<std::string>& names) {
  for (size_t i = 0; i < arrays.size(); ++i) {
    if (aten::IsNullArray(arrays[i]))
      continue;
    CHECK(arrays[i]->dtype.code == kDLInt);
    CHECK_EQ(arrays[i]->ndim, 1);
    CHECK_EQ(bits, arrays[i]->dtype.bits)
      << "Expected " << bits << " integer array. But got "
      << arrays[i]->dtype.bits << " for " << names[i] << ".";
  }
}

// Return true if the operator is commutative and lhs and rhs need
// to be switched. For example, Add(kDst, kSrc) needs to be changed
// to Add(kSrc, kDst).
// This is because we only generate kernels for
//   Add(kSrc, kDst), Add(kDst, kEdge), Add(kSrc, kDst)
// to save compilation time.
inline bool NeedSwitchOrder(const std::string& op,
                            binary_op::Target lhs, binary_op::Target rhs) {
  CHECK_NE(lhs, rhs);
  return (op == binary_op::kAdd || op == binary_op::kMul)
    && lhs > rhs;
}

class ImmutableGraphCSRWrapper : public CSRWrapper {
 public:
  explicit ImmutableGraphCSRWrapper(const ImmutableGraph* graph) :
    gptr_(graph) { }

  aten::CSRMatrix GetInCSRMatrix() const override {
    return gptr_->GetInCSR()->ToCSRMatrix();
  }

  aten::CSRMatrix GetOutCSRMatrix() const override {
    return gptr_->GetOutCSR()->ToCSRMatrix();
  }

  DGLContext Context() const override {
    return gptr_->Context();
  }

  int NumBits() const override {
    return gptr_->NumBits();
  }

 private:
  const ImmutableGraph* gptr_;
};

class UnitGraphCSRWrapper : public CSRWrapper {
 public:
  explicit UnitGraphCSRWrapper(const UnitGraph* graph) :
    gptr_(graph) { }

  aten::CSRMatrix GetInCSRMatrix() const override {
    return gptr_->GetCSCMatrix(0);
  }

  aten::CSRMatrix GetOutCSRMatrix() const override {
    return gptr_->GetCSRMatrix(0);
  }

  DGLContext Context() const override {
    return gptr_->Context();
  }

  int NumBits() const override {
    return gptr_->NumBits();
  }

 private:
  const UnitGraph* gptr_;
};

}  // namespace

std::vector<int64_t> InferBinaryFeatureShape(
    const std::string& op,
    NDArray lhs,
    NDArray rhs) {
  return CalcBcastInfo(op, lhs, rhs).real_out_shape;
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelInferBinaryFeatureShape")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string op = args[0];
    NDArray lhs = args[1];
    NDArray rhs = args[2];
    const auto& shape = InferBinaryFeatureShape(op, lhs, rhs);
    const int64_t len = shape.size();
    NDArray ret = NDArray::Empty(
        {len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
    int64_t* ret_data = static_cast<int64_t*>(ret->data);
    std::copy(shape.begin(), shape.end(), ret_data);
    *rv = ret;
  });

void BinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {lhs_data, rhs_data, out_data, lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_data", "rhs_data", "out_data",
       "lhs_mapping", "rhs_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_mapping", "rhs_mapping", "out_mapping"});
  // Switch order for commutative operation
  if (NeedSwitchOrder(op, lhs, rhs)) {
    BinaryOpReduce(reducer, op, graph,
        rhs, lhs,
        rhs_data, lhs_data, out_data,
        rhs_mapping, lhs_mapping, out_mapping);
  } else {
    if (HasBcast(lhs_data, rhs_data)) {
      BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BinaryReduceBcastImpl,
          info, reducer, op, graph,
          lhs, rhs,
          lhs_data, rhs_data, out_data,
          lhs_mapping, rhs_mapping, out_mapping);
    } else {
      CHECK(IsValidBinaryOpShape(lhs_data, rhs_data))
        << "Cannot compute binary operation between feature shapes "
        << ShapeString(lhs_data) << " and " << ShapeString(rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BinaryReduceImpl,
          reducer, op, graph,
          lhs, rhs,
          lhs_data, rhs_data, out_data,
          lhs_mapping, rhs_mapping, out_mapping);
    }
  }
}

void csrwrapper_switch(DGLArgValue argval,
                       std::function<void(const CSRWrapper&)> fn) {
  DGL_CHECK_TYPE_CODE(argval.type_code(), kObjectHandle);
  if (argval.IsObjectType<GraphRef>()) {
    GraphRef g = argval;
    auto igptr = std::dynamic_pointer_cast<ImmutableGraph>(g.sptr());
    CHECK_NOTNULL(igptr);
    ImmutableGraphCSRWrapper wrapper(igptr.get());
    fn(wrapper);
  } else if (argval.IsObjectType<HeteroGraphRef>()) {
    HeteroGraphRef g = argval;
    auto bgptr = std::dynamic_pointer_cast<UnitGraph>(g->GetRelationGraph(0));
    CHECK_NOTNULL(bgptr);
    UnitGraphCSRWrapper wrapper(bgptr.get());
    fn(wrapper);
  }
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_data = args[5];
    NDArray rhs_data = args[6];
    NDArray out_data = args[7];
    NDArray lhs_mapping = args[8];
    NDArray rhs_mapping = args[9];
    NDArray out_mapping = args[10];

    auto f = [&reducer, &op, &lhs, &rhs, &lhs_data, &rhs_data, &out_data,
              &lhs_mapping, &rhs_mapping, &out_mapping](const CSRWrapper& wrapper) {
      BinaryOpReduce(reducer, op, wrapper,
          static_cast<binary_op::Target>(lhs),
          static_cast<binary_op::Target>(rhs),
          lhs_data, rhs_data, out_data,
          lhs_mapping, rhs_mapping, out_mapping);
    };
    csrwrapper_switch(args[2], f);
  });

void BackwardLhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_lhs_data) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {lhs_data, rhs_data, out_data, grad_out_data, grad_lhs_data,
       lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_data", "rhs_data", "out_data", "grad_out_data", "grad_lhs_data",
       "lhs_mapping", "rhs_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_mapping", "rhs_mapping", "out_mapping"});
  // Switch order for commutative operation
  if (NeedSwitchOrder(op, lhs, rhs)) {
    BackwardRhsBinaryOpReduce(reducer, op, graph,
        rhs, lhs,
        rhs_mapping, lhs_mapping, out_mapping,
        rhs_data, lhs_data, out_data,
        grad_out_data, grad_lhs_data);
  } else {
    if (HasBcast(lhs_data, rhs_data)) {
      BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl,
          info, reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          grad_lhs_data, aten::NullArray());
    } else {
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceImpl,
          reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          grad_lhs_data, aten::NullArray());
    }
  }
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBackwardLhsBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_mapping = args[5];
    NDArray rhs_mapping = args[6];
    NDArray out_mapping = args[7];
    NDArray lhs_data = args[8];
    NDArray rhs_data = args[9];
    NDArray out_data = args[10];
    NDArray grad_out_data = args[11];
    NDArray grad_lhs_data = args[12];

    auto f = [&reducer, &op, &lhs, &rhs,
              &lhs_mapping, &rhs_mapping, &out_mapping,
              &lhs_data, &rhs_data, &out_data,
              &grad_out_data, &grad_lhs_data](const CSRWrapper& wrapper) {
      BackwardLhsBinaryOpReduce(reducer, op, wrapper,
          static_cast<binary_op::Target>(lhs),
          static_cast<binary_op::Target>(rhs),
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data,
          grad_out_data, grad_lhs_data);
    };
    csrwrapper_switch(args[2], f);
  });

void BackwardRhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_rhs_data) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {lhs_data, rhs_data, out_data, grad_out_data, grad_rhs_data,
       lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_data", "rhs_data", "out_data", "grad_out_data", "grad_rhs_data",
       "lhs_mapping", "rhs_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_mapping", "rhs_mapping", "out_mapping"});
  if (NeedSwitchOrder(op, lhs, rhs)) {
    BackwardLhsBinaryOpReduce(reducer, op, graph,
        rhs, lhs,
        rhs_mapping, lhs_mapping, out_mapping,
        rhs_data, lhs_data, out_data,
        grad_out_data, grad_rhs_data);
  } else {
    if (HasBcast(lhs_data, rhs_data)) {
      BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl,
          info, reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          aten::NullArray(), grad_rhs_data);
    } else {
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceImpl,
          reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          aten::NullArray(), grad_rhs_data);
    }
  }
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBackwardRhsBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_mapping = args[5];
    NDArray rhs_mapping = args[6];
    NDArray out_mapping = args[7];
    NDArray lhs_data = args[8];
    NDArray rhs_data = args[9];
    NDArray out_data = args[10];
    NDArray grad_out_data = args[11];
    NDArray grad_rhs_data = args[12];

    auto f = [&reducer, &op, &lhs, &rhs,
              &lhs_mapping, &rhs_mapping, &out_mapping,
              &lhs_data, &rhs_data, &out_data,
              &grad_out_data, &grad_rhs_data](const CSRWrapper& wrapper) {
      BackwardRhsBinaryOpReduce(reducer, op, wrapper,
          static_cast<binary_op::Target>(lhs),
          static_cast<binary_op::Target>(rhs),
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data,
          grad_out_data, grad_rhs_data);
    };
    csrwrapper_switch(args[2], f);
  });

void CopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    NDArray in_data, NDArray out_data,
    NDArray in_mapping, NDArray out_mapping) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {in_data, out_data, in_mapping, out_mapping},
      {"in_data", "out_data", "in_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {in_mapping, out_mapping},
      {"in_mapping", "out_mapping"});
  DGL_XPU_SWITCH(ctx.device_type, BinaryReduceImpl,
      reducer, binary_op::kUseLhs, graph,
      target, binary_op::kNone,
      in_data, aten::NullArray(), out_data,
      in_mapping, aten::NullArray(), out_mapping);
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelCopyReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    int target = args[2];
    NDArray in_data = args[3];
    NDArray out_data = args[4];
    NDArray in_mapping = args[5];
    NDArray out_mapping = args[6];

    auto f = [&reducer, &target, &in_data, &out_data,
              &in_mapping, &out_mapping](const CSRWrapper& wrapper) {
      CopyReduce(reducer, wrapper,
          static_cast<binary_op::Target>(target),
          in_data, out_data, in_mapping, out_mapping);
    };
    csrwrapper_switch(args[1], f);
  });

void BackwardCopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    NDArray in_mapping, NDArray out_mapping,
    NDArray in_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_in_data) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {in_data, out_data, grad_out_data, grad_in_data, in_mapping, out_mapping},
      {"in_data", "out_data", "grad_out_data", "grad_in_data",
       "in_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {in_mapping, out_mapping},
      {"in_mapping", "out_mapping"});
  if (!aten::IsNullArray(out_mapping)) {
    CHECK_EQ(ctx, out_mapping->ctx)
      << "Expected device context " << ctx
      << ". But got " << out_mapping->ctx << " for rhs_data.";
  }
  DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceImpl,
      reducer, binary_op::kUseLhs, graph,
      target, binary_op::kNone,
      in_mapping, aten::NullArray(), out_mapping,
      in_data, aten::NullArray(), out_data,
      grad_out_data,
      grad_in_data, aten::NullArray());
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBackwardCopyReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    int target = args[2];
    NDArray in_data = args[3];
    NDArray out_data = args[4];
    NDArray grad_out_data = args[5];
    NDArray grad_in_data = args[6];
    NDArray in_mapping = args[7];
    NDArray out_mapping = args[8];

    auto f = [&reducer, &target, &in_mapping, &out_mapping,
              &in_data, &out_data, &grad_out_data,
              &grad_in_data](const CSRWrapper& wrapper) {
      BackwardCopyReduce(reducer, wrapper,
          static_cast<binary_op::Target>(target),
          in_mapping, out_mapping, in_data, out_data,
          grad_out_data, grad_in_data);
    };
    csrwrapper_switch(args[1], f);
  });

}  // namespace kernel
}  // namespace dgl
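The trickiest part of the deleted file above is the shape preprocessing in CalcBcastInfo: shapes are walked from the last dimension, runs of equal dimensions are multiplied together, and only broadcast dimensions break the run. A standalone sketch of that flattening step written against plain std::vector rather than NDArray (illustrative only; the helper name is not from this repository):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Align two feature shapes to the same rank (prepend 1s) and flatten runs of
// equal, non-broadcast dimensions, mirroring what CalcBcastInfo does.
std::pair<std::vector<int64_t>, std::vector<int64_t>> AlignAndFlatten(
    std::vector<int64_t> lhs, std::vector<int64_t> rhs) {
  while (lhs.size() < rhs.size()) lhs.insert(lhs.begin(), 1);
  while (rhs.size() < lhs.size()) rhs.insert(rhs.begin(), 1);
  std::vector<int64_t> l, r;
  int64_t accum = 0;
  for (int i = static_cast<int>(lhs.size()) - 1; i >= 0; --i) {
    if (lhs[i] != rhs[i]) {            // broadcast dimension: flush the run
      if (accum != 0) { l.push_back(accum); r.push_back(accum); accum = 0; }
      l.push_back(lhs[i]); r.push_back(rhs[i]);
    } else {                           // equal dimension: extend the run
      accum = (accum == 0) ? lhs[i] : accum * lhs[i];
    }
  }
  if (accum != 0) { l.push_back(accum); r.push_back(accum); }
  std::reverse(l.begin(), l.end());
  std::reverse(r.begin(), r.end());
  return {l, r};
}

int main() {
  // (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9),
  // as the comment in CalcBcastInfo describes.
  auto p = AlignAndFlatten({4, 1, 3, 3}, {4, 5, 3, 3});
  for (int64_t d : p.first) std::cout << d << ' ';
  std::cout << "| ";
  for (int64_t d : p.second) std::cout << d << ' ';
  std::cout << '\n';  // prints: 4 1 9 | 4 5 9
}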
src/kernel/binary_reduce.h (deleted, 100644 → 0) @ 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce.h
* \brief Binary reduce function C++ header.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_H_
#define DGL_KERNEL_BINARY_REDUCE_H_
#include <dgl/runtime/ndarray.h>
#include <vector>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace dgl {
namespace kernel {

// Structure for broadcasting shapes
struct BcastInfo {
  // inferred output shape
  std::vector<int64_t> real_out_shape;
  // Following shapes here have been preprocessed, so that:
  //  - The first dimension (for graph) is removed. Shapes here are only for features.
  //  - They have the same number of dimensions.
  //    e.g. (4,) and (3, 4) become (1, 4) and (3, 4)
  //  - Continuous non-broadcasting dimenions are flattened.
  //    e.g. (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9)
  std::vector<int64_t> lhs_shape, lhs_stride;
  std::vector<int64_t> rhs_shape, rhs_stride;
  std::vector<int64_t> out_shape, out_stride;
  int64_t data_len;
};
/*
* !\brief Compute the feature shape after binary reduce computation.
*/
std::vector<int64_t> InferBinaryFeatureShape(
    runtime::NDArray lhs,
    runtime::NDArray rhs);
/*!
* \brief Perform binary operation between the given data and reduce by the graph.
*
* If the reducer is one of "sum, "max, "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} ( A[s1(i, j, e)] op B[s2(i, j, e)] )
*
* , where A, B are two input feature tensors, op could be element-wise add/sub/div/mul.
* Depending on the lhs and rhs target, s1 and s2 will select the src/dst/edge
* ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)] op B[s2(i, j, e)]
*
* Here, the node/edge feature (e.g., A[i], B[e]) could be dense tensor. In such
* case, broadcasting is supported on the feature dimensions.
*
* Examples:
*
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void BinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);
/*!
* \brief Compute the lhs gradient of BinaryOpReduce
*
* Broadcasting along feature dimensions is supported. However, the gradient
* of the being-broadcasted dimensions will *not* be reduced. Therefore, the
* gradient tensor has the same shape with the out tensor.
*
* Examples:
* A.shape = (N, D1, 1) # N is the number of nodes
* B.shape = (M, D1, D2) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dA = BackwardLhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dA.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
void BackwardLhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data);
/*!
* \brief Compute the rhs gradient of BinaryOpReduce
*
* Broadcasting along feature dimensions is supported. However, the gradient
* of the being-broadcasted dimensions will *not* be reduced. Therefore, the
* gradient tensor has the same shape with the out tensor.
*
* Examples:
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dB = BackwardRhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dB.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_rhs_data The gradient rhs tensor.
*/
void BackwardRhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_rhs_data);
/*!
* \brief Copy the target data and reduce by graph structure.
*
* If the reducer is one of "sum, "max, "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} A[s1(i, j, e)]
*
* , where A, B are two input feature tensors.
* Depending on the lhs and rhs target, s1 and s2 will select the src/dst/edge
* ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)]
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
* \param target The nput target (src, edge)
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void CopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    runtime::NDArray in_data, runtime::NDArray out_data,
    runtime::NDArray in_mapping, runtime::NDArray out_mapping);
/*!
* \brief Compute backward of the CopyReduce
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
* \param target The nput target (src, edge)
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_in_data The gradient input tensor.
*/
void BackwardCopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    runtime::NDArray in_mapping, runtime::NDArray out_mapping,
    runtime::NDArray in_data, runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_in_data);

}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_H_
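For reference, the semantics documented above for BinaryOpReduce, out[i] = Σ_{j ∈ Neighbor(i)} (A[s1(i, j, e)] op B[s2(i, j, e)]), reduce to a simple nested loop when the features are scalar. A CPU sketch for reducer "sum", op "mul", lhs = src, rhs = edge over an in-edge CSR (illustrative only; the function name and the indptr/indices/eid layout are assumptions, not code from this repository):

#include <cstdint>
#include <vector>

// Reference loop for BinaryOpReduce("sum", "mul", lhs=src, rhs=edge) with
// scalar features; indptr/indices/eid describe the in-edge CSR of the graph.
std::vector<float> SumMulSrcEdge(const std::vector<int64_t>& indptr,
                                 const std::vector<int64_t>& indices,
                                 const std::vector<int64_t>& eid,
                                 const std::vector<float>& src_feat,
                                 const std::vector<float>& edge_feat) {
  std::vector<float> out(indptr.size() - 1, 0.f);
  for (size_t dst = 0; dst + 1 < indptr.size(); ++dst) {
    for (int64_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
      // A[s1(i, j, e)] op B[s2(i, j, e)] with s1 = source id, s2 = edge id
      out[dst] += src_feat[indices[k]] * edge_feat[eid[k]];
    }
  }
  return out;
}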
src/kernel/binary_reduce_common.h (deleted, 100644 → 0) @ 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_common.h
* \brief Common utilities for binary reduce operation.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_COMMON_H_
#define DGL_KERNEL_BINARY_REDUCE_COMMON_H_
#include <dgl/runtime/ndarray.h>
#include <limits>
#include <string>
#include "./common.h"
namespace dgl {
namespace kernel {
namespace binary_op {

/*! \brief Reducer names. */
static const char kReduceSum[] = "sum";
static const char kReduceMax[] = "max";
static const char kReduceMin[] = "min";
static const char kReduceMean[] = "mean";
static const char kReduceProd[] = "prod";
static const char kReduceNone[] = "none";

/*! \brief Binary op names. */
static const char kAdd[] = "add";
static const char kSub[] = "sub";
static const char kMul[] = "mul";
static const char kDiv[] = "div";
static const char kDot[] = "dot";
static const char kUseLhs[] = "use_lhs";

/*!
 * \brief Enum code for operand targets.
 * \seealso BinaryOpReduce in binary_reduce_common.h
 */
enum Target {
  kSrc = 0,  // select src node
  kDst,      // select dst node
  kEdge,     // select edge
  kNone,     // select none
};

/*! \brief Enum code for backward operator mode. */
enum BackwardMode {
  kGradLhs = 0,  // compute lhs gradient
  kGradRhs,      // compute rhs gradient
  kGradBoth,     // compute both gradients
};

}  // namespace binary_op
//////////////////////////////////////////////////////////////////////////
// Defines operand target category. Each category is a structure with
// two static members:
// - target: The enum code of this category.
// - Call: The call functor that returns the selected target.
//////////////////////////////////////////////////////////////////////////
/*! \brief Select src category. */
struct SelectSrc {
  // Target value
  static constexpr binary_op::Target target = binary_op::kSrc;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return src;
  }
};

/*! \brief Select dst category. */
struct SelectDst {
  // Target value
  static constexpr binary_op::Target target = binary_op::kDst;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return dst;
  }
};

/*! \brief Select edge category. */
struct SelectEdge {
  // Target value
  static constexpr binary_op::Target target = binary_op::kEdge;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return edge;
  }
};

/*! \brief Select none category. */
struct SelectNone {
  // Target value
  static constexpr binary_op::Target target = binary_op::kNone;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return 0;
  }
};

/*! \brief Type functor to switch SelectSrc and SelectDst category.
 *  SelectEdge and SelectNone will remain the same. */
template <typename Selector>
struct SwitchSrcDst {
  typedef Selector Type;
};

template <>
struct SwitchSrcDst<SelectSrc> {
  typedef SelectDst Type;
};

template <>
struct SwitchSrcDst<SelectDst> {
  typedef SelectSrc Type;
};
//////////////////////////////////////////////////////////////////////////
// Defines binary op category. Each category is a structure with
// three static members:
// - Call: The forward computation given two operand.
// - BackwardLhs: Compute lhs gradient.
// - BackwardRhs: Compute rhs gradient.
//////////////////////////////////////////////////////////////////////////
// common binary functors
template <typename DType>
struct BinaryAdd {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] + rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
};

template <typename DType>
struct BinaryMul {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] * rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return rhs;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return lhs;
  }
};

template <typename DType>
struct BinarySub {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] - rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return -1;
  }
};

template <typename DType>
struct BinaryDiv {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] / rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return static_cast<DType>(1) / rhs;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return -lhs / (rhs * rhs);
  }
};

template <typename DType>
struct BinaryUseLhs {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return 0;
  }
};

template <typename DType>
struct BinaryDot {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    DType out = 0;
    // simple vector dot vector
#pragma unroll
    for (int i = 0; i < len; i++)
      out += lhs[i] * rhs[i];
    return out;
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return rhs;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return lhs;
  }
};
// Macro for dispatching op enum code and target code into template arguments.
// The macro dispatches following combinations:
// - Add(Src, Dst), Add(Src, Edge), Add(Dst, Edge)
// - Mul(Src, Dst), Mul(Src, Edge), Mul(Dst, Edge)
// - Sub(Src, Dst), Sub(Src, Edge), Sub(Dst, Edge)
// Sub(Dst, Src), Sub(Edge, Src), Sub(Edge, Dst)
// - Div(Src, Dst), Div(Src, Edge), Div(Dst, Edge)
// Div(Dst, Src), Div(Edge, Src), Div(Edge, Dst)
// - UseLhs(Src, None), UseLhs(Edge, None)
// - Dot(Src, Dst), Dot(Src, Edge), Dot(Dst, Edge)
// - Dot(Dst, Src), Dot(Edge, Src), Dot(Edge, Dst)
// Note that for commutative operators (e.g. Add and Mul), we only generate
// kernels for lhs code smaller than rhs code.
#define OP_TARGET_SWITCH(op, lhs, rhs, DType, OpType, LeftType, RightType, ...) \
{ \
using namespace binary_op; \
if (op == kAdd && lhs == kSrc && rhs == kDst) { \
typedef BinaryAdd<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kAdd && lhs == kSrc && rhs == kEdge) { \
typedef BinaryAdd<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kAdd && lhs == kDst && rhs == kEdge) { \
typedef BinaryAdd<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kMul && lhs == kSrc && rhs == kDst) { \
typedef BinaryMul<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kMul && lhs == kSrc && rhs == kEdge) { \
typedef BinaryMul<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kMul && lhs == kDst && rhs == kEdge) { \
typedef BinaryMul<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kSrc && rhs == kDst) { \
typedef BinarySub<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kDst && rhs == kSrc) { \
typedef BinarySub<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kSrc && rhs == kEdge) { \
typedef BinarySub<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kEdge && rhs == kSrc) { \
typedef BinarySub<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kDst && rhs == kEdge) { \
typedef BinarySub<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kEdge && rhs == kDst) { \
typedef BinarySub<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kSrc && rhs == kDst) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kDst && rhs == kSrc) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kSrc && rhs == kEdge) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kEdge && rhs == kSrc) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kDst && rhs == kEdge) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kEdge && rhs == kDst) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kUseLhs && lhs == kSrc) { \
typedef BinaryUseLhs<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectNone RightType; \
{__VA_ARGS__} \
} else if (op == kUseLhs && lhs == kEdge) { \
typedef BinaryUseLhs<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectNone RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kSrc && rhs == kDst) { \
typedef BinaryDot<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kSrc && rhs == kEdge) { \
typedef BinaryDot<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kDst && rhs == kEdge) { \
typedef BinaryDot<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kDst && rhs == kSrc) { \
typedef BinaryDot<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kEdge && rhs == kSrc) { \
typedef BinaryDot<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kEdge && rhs == kDst) { \
typedef BinaryDot<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported operation: op=" << op \
<< " lhs=" << lhs << " rhs=" << rhs; \
} \
}
// Macro for unrolling with various template argument combinations
#define GEN_OP_TARGET(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryAdd)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryAdd)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryAdd)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryMul)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryMul)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryMul)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectNone, BinaryUseLhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDot))
//////////////////////////////////////////////////////////////////////////
// Defines reducer category. Each category is an empty structure.
// The call functor is device dependent, so should be specialized
// in the each device's implementation.
// See Also:
// - kernel/cpu/functor.h
// - kernel/cuda/functor.cuh
//////////////////////////////////////////////////////////////////////////
// functors for reducers
template <int XPU, typename DType>
struct ReduceSum { };

template <int XPU, typename DType>
struct ReduceMax { };

template <int XPU, typename DType>
struct ReduceMin { };

template <int XPU, typename DType>
struct ReduceProd { };

template <int XPU, typename DType>
struct ReduceNone { };
// Macro for dispatching reducer names to Reducer op structure
#define REDUCER_SWITCH(val, XPU, DType, RedType, ...) \
if (val == binary_op::kReduceSum \
|| val == binary_op::kReduceMean) { \
typedef ReduceSum<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceMax) { \
typedef ReduceMax<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceMin) { \
typedef ReduceMin<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceProd) { \
typedef ReduceProd<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceNone) { \
typedef ReduceNone<XPU, DType> RedType; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported reducer: " << val; \
}
// Type trait for getting zero value of the given reducer type.
template <typename Reducer>
struct Zero { };

template <int XPU, typename DType>
struct Zero<ReduceSum<XPU, DType>> {
  static constexpr DType value = 0;
};

template <int XPU, typename DType>
struct Zero<ReduceMax<XPU, DType>> {
  static constexpr DType value = std::numeric_limits<DType>::lowest();
};

template <int XPU, typename DType>
struct Zero<ReduceMin<XPU, DType>> {
  static constexpr DType value = std::numeric_limits<DType>::max();
};

template <int XPU, typename DType>
struct Zero<ReduceProd<XPU, DType>> {
  static constexpr DType value = 1;
};

template <int XPU, typename DType>
struct Zero<ReduceNone<XPU, DType>> {
  static constexpr DType value = 0;
};

template <int XPU, typename DType>
constexpr DType Zero<ReduceSum<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceMax<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceMin<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceProd<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceNone<XPU, DType>>::value;
// Type functor for selecting output target based on reducer type.
/*! \brief For all the reducer types except ReduceNone, select dst as the output target. */
template <typename Reducer>
struct OutSelector {
  typedef SelectDst Type;
};

/*! \brief For ReduceNone, select edge as the output target. */
template <int XPU, typename DType>
struct OutSelector<ReduceNone<XPU, DType>> {
  typedef SelectEdge Type;
};
// macro for dispatching number of broadcasting dimensions to template argument
#define BCAST_NDIM_SWITCH(ndim, NDim, ...) \
if (ndim <= 2) { \
constexpr int NDim = 2; \
{__VA_ARGS__} \
} else if (ndim <= 4) { \
constexpr int NDim = 4; \
{__VA_ARGS__} \
} else if (ndim <= 8) { \
constexpr int NDim = 8; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Too many broadcasting dimensions."; \
}
// macro for unrolling different broadcasting dimensions
#define GEN_NDIM(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, 2)) \
MSVC_EXPAND(GEN(__VA_ARGS__, 4)) \
MSVC_EXPAND(GEN(__VA_ARGS__, 8))
// macro for dispatching backward mode enum to template argument
#define BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, ...) \
CHECK(!(req_lhs && req_rhs)); \
if (req_lhs) { \
constexpr int Mode = binary_op::kGradLhs; \
{__VA_ARGS__} \
} else { \
constexpr int Mode = binary_op::kGradRhs; \
{__VA_ARGS__} \
}
// macro for unrolling different backward mode
#define GEN_BACKWARD_MODE(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, binary_op::kGradLhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, binary_op::kGradRhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, binary_op::kGradBoth))
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_COMMON_H_
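The *_SWITCH macros above all follow one pattern: enumerate every allowed runtime combination and bind each to concrete template types, so that the code passed in __VA_ARGS__ is instantiated once per combination. A minimal self-contained version of that dispatch pattern, using hypothetical names (MY_REDUCER_SWITCH, SumReducer, MaxReducer) rather than DGL's:

#include <iostream>
#include <string>

template <typename T> struct SumReducer { static T Combine(T a, T b) { return a + b; } };
template <typename T> struct MaxReducer { static T Combine(T a, T b) { return a > b ? a : b; } };

// Dispatch a runtime reducer name to a compile-time Reducer type, then run the
// body in __VA_ARGS__ with that type bound, mirroring REDUCER_SWITCH above.
#define MY_REDUCER_SWITCH(name, DType, Reducer, ...)        \
  if (name == "sum") {                                      \
    typedef SumReducer<DType> Reducer;                      \
    { __VA_ARGS__ }                                         \
  } else if (name == "max") {                               \
    typedef MaxReducer<DType> Reducer;                      \
    { __VA_ARGS__ }                                         \
  } else {                                                  \
    std::cerr << "Unsupported reducer: " << name << '\n';   \
  }

int main() {
  std::string reducer = "max";
  MY_REDUCER_SWITCH(reducer, float, Red, {
    std::cout << Red::Combine(2.f, 5.f) << '\n';  // prints 5
  });
}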
src/kernel/binary_reduce_impl.h (deleted, 100644 → 0) @ 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl.h
* \brief Implementations of binary reduce operations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <string>
#ifdef __CUDACC__
#include "../runtime/cuda/cuda_common.h"
#endif
#include "./binary_reduce.h"
#include "./binary_reduce_impl_decl.h"
#include "./csr_interface.h"
#include "./utils.h"
namespace dgl {
namespace kernel {
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType, typename Reducer>
GData<Idx, DType> AllocGData(
    const std::string& op,
    const DLContext& ctx, int64_t x_len,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_mapping, runtime::NDArray out_data) {
  // GData
  GData<Idx, DType> gdata;
  gdata.x_length = x_len;
  gdata.lhs_data = static_cast<DType*>(lhs_data->data);
  gdata.rhs_data = static_cast<DType*>(rhs_data->data);
  gdata.out_data = static_cast<DType*>(out_data->data);
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  // for dot operation: vector [dot] vector
  if (op == binary_op::kDot) {
    // get size of vector
    gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
  } else {
    gdata.data_len = 1;
  }
  // fill out data with zero values
  utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data),
                   Zero<Reducer>::value);
  return gdata;
}
template <int XPU>
void BinaryReduceImpl(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping) {
  using runtime::NDArray;
  using minigun::Csr;
  // device
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  const int64_t x_len = utils::ComputeXLength(out_data);

  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif

  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  const DLDataType& dtype = out_data->dtype;
  const auto bits = graph.NumBits();
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
        auto gdata = AllocGData<XPU, Idx, DType, Reducer>(
            op, rtcfg.ctx, x_len, lhs_mapping, rhs_mapping,
            lhs_data, rhs_data, out_mapping, out_data);
        OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
          CallBinaryReduce<XPU, Idx, DType, LeftTarget,
                           RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
        });
      });
    });
  });
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType>
BackwardGData<Idx, DType> AllocBackwardGData(
    const std::string& op,
    const DLContext& ctx, int64_t x_len,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data) {
  // GData
  BackwardGData<Idx, DType> gdata;
  gdata.x_length = x_len;
  gdata.lhs_data = static_cast<DType*>(lhs_data->data);
  gdata.rhs_data = static_cast<DType*>(rhs_data->data);
  gdata.out_data = static_cast<DType*>(out_data->data);
  gdata.grad_out_data = static_cast<DType*>(grad_out_data->data);
  if (!aten::IsNullArray(grad_lhs_data)) {
    gdata.grad_lhs_data = static_cast<DType*>(grad_lhs_data->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs_data),
                     static_cast<DType>(0));
  }
  if (!aten::IsNullArray(grad_rhs_data)) {
    gdata.grad_rhs_data = static_cast<DType*>(grad_rhs_data->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs_data),
                     static_cast<DType>(0));
  }
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  // for dot operation: vector [dot] vector
  if (op == binary_op::kDot) {
    // get size of vector
    gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
  } else {
    gdata.data_len = 1;
  }
  return gdata;
}
template
<
int
XPU
>
void
BackwardBinaryReduceImpl
(
const
std
::
string
&
reducer
,
const
std
::
string
&
op
,
const
CSRWrapper
&
graph
,
binary_op
::
Target
lhs
,
binary_op
::
Target
rhs
,
runtime
::
NDArray
lhs_mapping
,
runtime
::
NDArray
rhs_mapping
,
runtime
::
NDArray
out_mapping
,
runtime
::
NDArray
lhs_data
,
runtime
::
NDArray
rhs_data
,
runtime
::
NDArray
out_data
,
runtime
::
NDArray
grad_out_data
,
runtime
::
NDArray
grad_lhs_data
,
runtime
::
NDArray
grad_rhs_data
)
{
using
runtime
::
NDArray
;
using
minigun
::
Csr
;
#ifdef __CUDACC__
// device
auto
*
thr_entry
=
runtime
::
CUDAThreadEntry
::
ThreadLocal
();
#endif
// Graph
const
int64_t
x_len
=
utils
::
ComputeXLength
(
out_data
);
// advance config
minigun
::
advance
::
RuntimeConfig
rtcfg
;
rtcfg
.
ctx
=
out_data
->
ctx
;
#ifdef __CUDACC__
rtcfg
.
stream
=
thr_entry
->
stream
;
const
int
nt
=
utils
::
FindNumThreads
(
x_len
,
64
);
rtcfg
.
data_num_threads
=
nt
;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg
.
data_num_blocks
=
(
x_len
+
(
nt
*
2
)
-
1
)
/
(
nt
*
2
);
#endif
const
DLDataType
&
dtype
=
out_data
->
dtype
;
const
bool
req_lhs
=
!
aten
::
IsNullArray
(
grad_lhs_data
);
const
bool
req_rhs
=
!
aten
::
IsNullArray
(
grad_rhs_data
);
const
auto
bits
=
graph
.
NumBits
();
if
(
reducer
==
binary_op
::
kReduceMean
)
{
// TODO(minjie): divide
LOG
(
FATAL
)
<<
"reduce mean is not supported."
;
}
DGL_DTYPE_SWITCH
(
dtype
,
DType
,
{
DGL_IDX_TYPE_SWITCH
(
bits
,
Idx
,
{
auto
gdata
=
AllocBackwardGData
<
XPU
,
Idx
,
DType
>
(
op
,
rtcfg
.
ctx
,
x_len
,
lhs_mapping
,
rhs_mapping
,
out_mapping
,
lhs_data
,
rhs_data
,
out_data
,
grad_out_data
,
grad_lhs_data
,
grad_rhs_data
);
BACKWARD_MODE_SWITCH
(
req_lhs
,
req_rhs
,
Mode
,
{
REDUCER_SWITCH
(
reducer
,
XPU
,
DType
,
Reducer
,
{
OP_TARGET_SWITCH
(
op
,
lhs
,
rhs
,
DType
,
BinaryOp
,
LeftTarget
,
RightTarget
,
{
CallBackwardBinaryReduce
<
XPU
,
Mode
,
Idx
,
DType
,
LeftTarget
,
RightTarget
,
BinaryOp
,
Reducer
>
(
rtcfg
,
graph
,
&
gdata
);
});
});
});
});
});
}
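Note: BACKWARD_MODE_SWITCH is defined elsewhere in the tree. As a rough, illustrative sketch of what the `req_lhs`/`req_rhs` pair selects (the enum names mirror the kGradLhs/kGradRhs/kGradBoth codes used in the UDFs below, but the values and dispatch here are assumptions, not the actual macro):

#include <iostream>

// Illustrative mode codes; the real ones live in binary_op::BackwardMode.
enum BackwardModeSketch { kGradLhsSketch = 0, kGradRhsSketch = 1, kGradBothSketch = 2 };

// Only the gradients that were actually requested get a kernel instantiation.
inline BackwardModeSketch SelectMode(bool req_lhs, bool req_rhs) {
  if (req_lhs && req_rhs) return kGradBothSketch;
  if (req_lhs) return kGradLhsSketch;
  return kGradRhsSketch;
}

int main() {
  std::cout << SelectMode(true, false) << " "   // only grad_lhs requested
            << SelectMode(false, true) << " "   // only grad_rhs requested
            << SelectMode(true, true) << "\n";  // both requested
  return 0;
}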
///////////////////////////////////////////////////////////////////////////////
// BinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////

template <int XPU, int NDim, typename Idx, typename DType, typename Reducer>
BcastGData<NDim, Idx, DType> AllocBcastGData(
    const DLContext& ctx, const BcastInfo& info,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_mapping, runtime::NDArray out_data) {
  // GData
  BcastGData<NDim, Idx, DType> gdata;
  // dim, shape and stride
  gdata.ndim = info.lhs_shape.size();
  std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
  std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
  std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
  std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
  std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
  std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
  gdata.lhs_len = utils::Prod(info.lhs_shape);
  gdata.rhs_len = utils::Prod(info.rhs_shape);
  gdata.out_len = utils::Prod(info.out_shape);
  // data
  gdata.lhs_data = static_cast<DType*>(lhs_data->data);
  gdata.rhs_data = static_cast<DType*>(rhs_data->data);
  gdata.out_data = static_cast<DType*>(out_data->data);
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  gdata.data_len = info.data_len;
  // fill out data with zero values
  utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data), Zero<Reducer>::value);
  return gdata;
}

template <int XPU>
void BinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping) {
  using runtime::NDArray;
  using minigun::Csr;
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int64_t x_len = utils::ComputeXLength(out_data);
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif

  const DLDataType& dtype = out_data->dtype;
  const int bcast_ndim = info.out_shape.size();
  const auto bits = graph.NumBits();

  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
        BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
          auto gdata = AllocBcastGData<XPU, NDim, Idx, DType, Reducer>(
              rtcfg.ctx, info, lhs_mapping, rhs_mapping,
              lhs_data, rhs_data, out_mapping, out_data);
          OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
            CallBinaryReduceBcast<XPU, NDim, Idx, DType, LeftTarget,
              RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
          });
        });
      });
    });
  });
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////

template <int XPU, int NDim, typename Idx, typename DType>
BackwardBcastGData<NDim, Idx, DType> AllocBackwardBcastGData(
    const DLContext& ctx, const BcastInfo& info,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
  // GData
  BackwardBcastGData<NDim, Idx, DType> gdata;
  // dim, shape and stride
  gdata.ndim = info.lhs_shape.size();
  gdata.lhs_len = utils::Prod(info.lhs_shape);
  gdata.rhs_len = utils::Prod(info.rhs_shape);
  gdata.out_len = utils::Prod(info.out_shape);
  std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
  std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
  std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
  std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
  std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
  std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
  // mappings
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  gdata.data_len = info.data_len;
  // data
  gdata.lhs_data = static_cast<DType*>(lhs->data);
  gdata.rhs_data = static_cast<DType*>(rhs->data);
  gdata.out_data = static_cast<DType*>(out->data);
  gdata.grad_out_data = static_cast<DType*>(grad_out->data);
  if (!aten::IsNullArray(grad_lhs)) {
    gdata.grad_lhs_data = static_cast<DType*>(grad_lhs->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs),
                     static_cast<DType>(0));
  }
  if (!aten::IsNullArray(grad_rhs)) {
    gdata.grad_rhs_data = static_cast<DType*>(grad_rhs->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs),
                     static_cast<DType>(0));
  }
  return gdata;
}

template <int XPU>
void BackwardBinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
  using runtime::NDArray;
  using minigun::Csr;
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int64_t x_len = utils::ComputeXLength(out);
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif

  const DLDataType& dtype = out->dtype;
  const int bcast_ndim = info.out_shape.size();
  const bool req_lhs = !aten::IsNullArray(grad_lhs);
  const bool req_rhs = !aten::IsNullArray(grad_rhs);
  const auto bits = graph.NumBits();

  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
        auto gdata = AllocBackwardBcastGData<XPU, NDim, Idx, DType>(
            rtcfg.ctx, info, lhs_mapping, rhs_mapping, out_mapping,
            lhs, rhs, out, grad_out, grad_lhs, grad_rhs);
        BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, {
          REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
            OP_TARGET_SWITCH(op, lhs_tgt, rhs_tgt, DType, BinaryOp, LeftTarget, RightTarget, {
              CallBackwardBinaryReduceBcast<XPU, Mode, NDim, Idx, DType,
                LeftTarget, RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
            });
          });
        });
      });
    });
  });
}

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_BINARY_REDUCE_IMPL_H_
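Note: the Alloc*GData helpers above pre-fill the output buffer with `Zero<Reducer>::value` before any edge is reduced in. That trait is declared in binary_reduce_common.h, which is not part of this diff page, so the sketch below only illustrates the idea (identity element per reducer) under assumed tag and trait names.

#include <iostream>
#include <limits>

// Hypothetical reducer tags standing in for sum/max/min reducers.
struct SumTag {};
struct MaxTag {};
struct MinTag {};

// Identity element used to initialize the output before reduction.
template <typename Reducer, typename DType> struct ZeroSketch;
template <typename DType> struct ZeroSketch<SumTag, DType> {
  static constexpr DType value = 0;
};
template <typename DType> struct ZeroSketch<MaxTag, DType> {
  static constexpr DType value = std::numeric_limits<DType>::lowest();
};
template <typename DType> struct ZeroSketch<MinTag, DType> {
  static constexpr DType value = std::numeric_limits<DType>::max();
};

int main() {
  std::cout << ZeroSketch<SumTag, float>::value << " "
            << ZeroSketch<MaxTag, float>::value << " "
            << ZeroSketch<MinTag, float>::value << "\n";
  return 0;
}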
src/kernel/binary_reduce_impl_decl.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl_decl.h
* \brief Data structure and function declarations for implementations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#include <dgl/runtime/ndarray.h>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace minigun {
namespace advance {
// forward declaration
struct RuntimeConfig;
}  // namespace advance
}  // namespace minigun
namespace dgl {
namespace kernel {

// forward declaration
struct BcastInfo;
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BinaryOpReduce in Minigun. */
template <typename Idx, typename DType>
struct GData {
  // length along x(feature) dimension
  int64_t x_length{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr};
  // output data
  DType *out_data{nullptr};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
  // output id mapping
  Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BinaryReduce operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceImpl(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BackwardBinaryReduce in Minigun. */
template <typename Idx, typename DType>
struct BackwardGData {
  // length along x(feature) dimension
  int64_t x_length{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
  DType *grad_out_data{nullptr};
  // output data
  DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
  // output id mapping
  Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce operator.
*
* Mode must be one of the enum code in binary_op::BackwardMode.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/backward_binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, int Mode, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceImpl(
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data structure used by computing BinaryOp with broadcasting in Minigun.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BcastGData {
  // actual number of feature dimensions
  int ndim{0};
  // input feature shape and stride
  int64_t lhs_len{0}, rhs_len{0};
  int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
  int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
  // output feature shape and stride
  int64_t out_len{0};  // output total feature length (equal to prod(out_shape));
  int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
  // output data
  DType *out_data{nullptr};
  // output id mapping
  Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data and auxiliary information for backward binary broadcasting op.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* The gradients of the broadcasting dimensions are not reduced. As a result,
* The grad_lhs and grad_rhs have the same shape as grad_out.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BackwardBcastGData {
  // actual number of feature dimensions
  int ndim{0};
  // input shape and stride
  int64_t lhs_len{0}, rhs_len{0}, out_len{0};
  int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
  int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
  int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr}, *out_mapping{nullptr};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
  DType *grad_out_data{nullptr};
  // output data
  DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardBcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
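Note: the doc comments in this header refer to "operand target categories" (source, destination, edge) for LeftSelector/RightSelector. The real selector types live in binary_reduce_common.h, which is not shown in this diff; the sketch below only illustrates the pattern with hypothetical names.

#include <cstdint>
#include <iostream>

using Idx = int64_t;

// Hypothetical selector functors: each picks which id an operand is indexed by.
struct PickSrc  { static Idx Call(Idx src, Idx edge, Idx dst) { return src;  } };
struct PickEdge { static Idx Call(Idx src, Idx edge, Idx dst) { return edge; } };
struct PickDst  { static Idx Call(Idx src, Idx edge, Idx dst) { return dst;  } };

// A kernel templated on selectors reads its lhs/rhs rows by whatever id the
// selector returns, which is how one template covers src*edge, src*dst, etc.
template <typename LeftSelector, typename RightSelector>
Idx RowPair(Idx src, Idx edge, Idx dst) {
  const Idx lid = LeftSelector::Call(src, edge, dst);
  const Idx rid = RightSelector::Call(src, edge, dst);
  return lid * 1000 + rid;  // pack both choices into one number for printing
}

int main() {
  // src=3, edge=7, dst=5
  std::cout << RowPair<PickSrc, PickDst>(3, 7, 5) << "\n";   // 3005
  std::cout << RowPair<PickSrc, PickEdge>(3, 7, 5) << "\n";  // 3007
  return 0;
}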
src/kernel/common.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/common.h
* \brief Kernel common utilities
*/
#ifndef DGL_KERNEL_COMMON_H_
#define DGL_KERNEL_COMMON_H_
#include <dgl/runtime/ndarray.h>
#include <cstdint>
#include "../c_api_common.h"
namespace dgl {
namespace kernel {
#ifdef __CUDACC__
#define DGLDEVICE __device__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __CUDACC__
// Macro for dispatch device flag to template function calls
#ifdef DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else if (val == kDLGPU) { \
Method<kDLGPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#else // DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#endif // DGL_USE_CUDA
// MSVC does not expand __VA_ARGS__ correctly, and needs this expand hack
#define MSVC_EXPAND(x) x
// Macro for dispatch dtype flag to template argument. Currently only
// support float32.
#define DGL_DTYPE_SWITCH(val, DType, ...) \
if (val.code == kDLFloat && val.bits == 32) { \
typedef float DType; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported dtype: " << val; \
}
// Macro for unrolling with data type arguments.
#define GEN_DTYPE(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, float))
// Macro for dispatch index nbits to template argument.
#ifdef __CUDACC__
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#else
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else if (bits == 64) { \
typedef int64_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#endif
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_COMMON_H_
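Note: a small self-contained example of how this kind of SWITCH macro turns a runtime dtype tag into a template argument. The toy macro and tag type below are illustrative, not the DGL ones (which dispatch on DLDataType and index bit width as defined above).

#include <cstdint>
#include <iostream>

// Toy runtime type tag; DGL uses DLDataType / NumBits() for the same purpose.
enum class ToyDType { kFloat32, kInt32 };

#define TOY_DTYPE_SWITCH(val, DType, ...)              \
  if ((val) == ToyDType::kFloat32) {                   \
    typedef float DType;                               \
    { __VA_ARGS__ }                                    \
  } else if ((val) == ToyDType::kInt32) {              \
    typedef int32_t DType;                             \
    { __VA_ARGS__ }                                    \
  } else {                                             \
    std::cerr << "Unsupported dtype\n";                \
  }

template <typename DType>
DType AddOne(DType x) { return x + DType(1); }

int main() {
  ToyDType tag = ToyDType::kFloat32;  // decided at runtime
  TOY_DTYPE_SWITCH(tag, DType, {
    // Inside the braces DType is a concrete type, so templates can be instantiated.
    std::cout << AddOne<DType>(DType(2)) << "\n";  // prints 3
  });
  return 0;
}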
src/kernel/cpu/backward_binary_reduce_impl.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
 * \file kernel/cpu/backward_binary_reduce_impl.h
 * \brief Minigun CPU UDFs for backward binary reduce
*/
#ifndef DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute backward binary reduce.
template <int Mode, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduce {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len;
    DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len;
    DType* gradoutoff = gdata->grad_out_data + oid * D;
    for (int64_t tx = 0; tx < D; ++tx) {
      DType out = Functors::Read(outoff + tx);
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);
      if (0 == grad_e) continue;
      DType* lhs_base = lhsoff + tx * len;
      DType* rhs_base = rhsoff + tx * len;
      if (Mode == binary_op::kGradBoth) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad;
        }
      } else if (Mode == binary_op::kGradLhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad_lhs;
        }
      } else if (Mode == binary_op::kGradRhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
          gradrhsoff[tx * len + i] += grad_rhs;
        }
      }
    }
  }
};
// Minigun UDF to compute backward binary reduce with broadcasting.
template <int Mode, int NDim, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduceBcast {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;
    DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
    DType* outoff = gdata->out_data + oid * gdata->out_len;
    DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len;
    DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len;
    DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len;
    int64_t tmp[NDim];  // store unraveled idx.
    for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
      Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
      DType out = Functors::Read(outoff + tx);
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(
          lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
          rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
          len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);
      // (pawelpiotrowicz) Although we can technically add the same condition for
      // skipping atomic additions as in BackwardBinaryReduce, doing so made the
      // speed 2% slower in GCMC training on MovieLens-1M with 24 OpenMP threads.
      // For more details, see https://github.com/dmlc/dgl/pull/1527.
      // TODO(BarclayII): Needs further investigation and benchmarking.
      DType* lhs_base = lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len;
      DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len;
      if (Mode == binary_op::kGradBoth) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad;
        }
      } else if (Mode == binary_op::kGradLhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad_lhs;
        }
      } else if (Mode == binary_op::kGradRhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
          gradrhsoff[tx * len + i] += grad_rhs;
        }
      }
    }
  }
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
struct BackwardFunctorsTempl {
  static inline Idx SelectOut(Idx src, Idx edge, Idx dst) {
    typedef typename OutSelector<Reducer>::Type OutTarget;
    return SwitchSrcDst<OutTarget>::Type::Call(src, edge, dst);
  }
  static inline Idx SelectLeft(Idx src, Idx edge, Idx dst) {
    return LeftSelector::Call(src, edge, dst);
  }
  static inline Idx SelectRight(Idx src, Idx edge, Idx dst) {
    return RightSelector::Call(src, edge, dst);
  }
  static inline DType Op(DType* lhs, DType* rhs, int64_t len) {
    return BinaryOp::Call(lhs, rhs, len);
  }
  static inline DType Read(DType* addr) {
    return *addr;
  }
  static inline void Write(DType* addr, DType val) {
    Reducer::Call(addr, val);
  }
  static inline Idx GetId(Idx id, Idx* id_map) {
    return *(id_map + id);
  }
  static inline DType BackwardWrite(DType val, DType accum) {
    return Reducer::BackwardCall(val, accum);
  }
  static inline DType BackwardOpLhs(DType lhs, DType rhs, DType out) {
    return BinaryOp::BackwardLhs(lhs, rhs, out);
  }
  static inline DType BackwardOpRhs(DType lhs, DType rhs, DType out) {
    return BinaryOp::BackwardRhs(lhs, rhs, out);
  }
};

typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;

}  // namespace cpu
// Template implementation of BackwardBinaryReduce operator.
template <int XPU, int Mode, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<Idx, DType>* gdata) {
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cpu::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cpu::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, BackwardGData<Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_DEFINE(mode, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduce<XPU, \
mode, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardGData<IDX, dtype>* gdata);
// Template implementation of BackwardBinaryReduce with broadcasting operator.
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardBcastGData<NDim, Idx, DType>* gdata) {
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cpu::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cpu::BackwardBinaryReduceBcast<Mode, NDim, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
    BackwardBcastGData<NDim, Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_BCAST_DEFINE(mode, ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduceBcast<XPU, \
mode, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardBcastGData<ndim, IDX, dtype>* gdata);
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
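Note: the gradient UDFs above scatter into shared buffers with `#pragma omp atomic` because several edges can map to the same row. A minimal standalone sketch of that accumulation pattern (compile with -fopenmp; without it the pragmas are ignored and the loop runs serially):

#include <cstdio>
#include <vector>

int main() {
  // Toy "graph": each edge adds its value into the row of its destination node.
  const int num_nodes = 4;
  const std::vector<int> edge_dst = {0, 1, 1, 3, 3, 3};
  const std::vector<float> edge_val = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  std::vector<float> node_accum(num_nodes, 0.f);

  // Different edges may share a destination, so the += must be atomic when the
  // loop runs in parallel -- the same reason the backward kernels above use it.
  #pragma omp parallel for
  for (int e = 0; e < static_cast<int>(edge_dst.size()); ++e) {
    #pragma omp atomic
    node_accum[edge_dst[e]] += edge_val[e];
  }

  for (int n = 0; n < num_nodes; ++n)
    std::printf("node %d: %.1f\n", n, node_accum[n]);  // 1.0, 5.0, 0.0, 15.0
  return 0;
}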
src/kernel/cpu/binary_bcast_reduce_max.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_max.cc
 * \brief CPU kernels for broadcasting binary reduce max
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceMax
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_min.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_min.cc
 * \brief CPU kernels for broadcasting binary reduce min
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceMin
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_none.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_none.cc
 * \brief CPU kernels for broadcasting binary reduce none
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceNone
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_prod.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_prod.cc
 * \brief CPU kernels for broadcasting binary reduce prod
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceProd
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_sum.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_sum.cc
 * \brief CPU kernels for broadcasting binary reduce sum
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceSum
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_reduce_impl.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.cc
* \brief Binary reduce implementation on CPU.
*/
#include "../binary_reduce_impl.h"
#include "../csr_interface.h"
using dgl::runtime::NDArray;

namespace dgl {
namespace kernel {

template void BinaryReduceImpl<kDLCPU>(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);

template void BinaryReduceBcastImpl<kDLCPU>(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);

template void BackwardBinaryReduceImpl<kDLCPU>(
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_lhs_data, NDArray grad_rhs_data);

template void BackwardBinaryReduceBcastImpl<kDLCPU>(
    const BcastInfo& info,
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs);

}  // namespace kernel
}  // namespace dgl
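Note: this .cc file contains only explicit template instantiations; the generic bodies live in the header and this translation unit pins down the <kDLCPU> versions. A toy reduction of the same split, with hypothetical names, shown as one compilable unit:

#include <iostream>

// In DGL the template definition sits in a header and the explicit
// instantiation sits in a .cc file; both are shown together here so the
// sketch compiles on its own.
template <int Device>
void Report(int x) {
  std::cout << "device " << Device << " value " << x << "\n";
}

// Forces code generation for Device=0 in exactly one translation unit,
// mirroring `template void BinaryReduceImpl<kDLCPU>(...)` above.
template void Report<0>(int);

int main() {
  Report<0>(42);  // links against the explicitly instantiated symbol
  return 0;
}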
src/kernel/cpu/binary_reduce_impl.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.h
* \brief Minigun CPU UDFs for binary reduce
*/
#ifndef DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <algorithm>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    for (int64_t tx = 0; tx < D; ++tx) {
      DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      Functors::Write(outoff + tx, out);
    }
  }
};
// Convert flattened index to multi-dimension index (assume row-major).
inline void Unravel(int64_t idx, int ndim,
    const int64_t* shape, const int64_t* stride, int64_t* out) {
  for (int d = 0; d < ndim; ++d) {
    out[d] = (idx / stride[d]) % shape[d];
  }
}
// Convert multi-dimension index to flattened index (assume row-major).
inline int64_t Ravel(const int64_t* idx, int ndim,
    const int64_t* shape, const int64_t* stride) {
  int64_t out = 0;
  for (int d = 0; d < ndim; ++d) {
    out += std::min(idx[d], shape[d] - 1) * stride[d];
  }
  return out;
}
// Minigun UDF to compute binary reduce with broadcasting.
template <int NDim, typename Idx, typename DType, typename Functors>
struct BinaryReduceBcast {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;  // data with len size
    DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
    DType* outoff = gdata->out_data + oid * gdata->out_len;
    int64_t tmp[NDim];  // store unraveled idx.
    for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
      Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
      DType out = Functors::Op(
          lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
          rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
          len);
      Functors::Write(outoff + tx, out);
    }
  }
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
struct FunctorsTempl {
  static inline Idx SelectOut(Idx src, Idx edge, Idx dst) {
    return OutSelector<Reducer>::Type::Call(src, edge, dst);
  }
  static inline Idx SelectLeft(Idx src, Idx edge, Idx dst) {
    return LeftSelector::Call(src, edge, dst);
  }
  static inline Idx SelectRight(Idx src, Idx edge, Idx dst) {
    return RightSelector::Call(src, edge, dst);
  }
  static inline DType Op(DType* lhs, DType* rhs, int64_t len) {
    return BinaryOp::Call(lhs, rhs, len);
  }
  static inline void Write(DType* addr, DType val) {
    Reducer::Call(addr, val);
  }
  static inline Idx GetId(Idx id, Idx* id_map) {
    return *(id_map + id);
  }
};

typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;

}  // namespace cpu
// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<Idx, DType>* gdata) {
  typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
                        RightSelector, BinaryOp, Reducer>
          Functors;
  typedef cpu::BinaryReduce<Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, GData<Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Template implementation of BinaryReduce broadcasting operator.
template <int XPU, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BcastGData<NDim, Idx, DType>* gdata) {
  typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
                        RightSelector, BinaryOp, Reducer>
          Functors;
  typedef cpu::BinaryReduceBcast<NDim, Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
    BcastGData<NDim, Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_DEFINE(dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduce<XPU, IDX, \
dtype, lhs_tgt, rhs_tgt, op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
GData<IDX, dtype>* gdata);
#define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduceBcast<XPU, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BcastGData<ndim, IDX, dtype>* gdata);
#define EVAL(F, ...) MSVC_EXPAND(F(__VA_ARGS__))
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
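Note: the Unravel/Ravel pair in this header is what implements NumPy-style broadcasting over the feature dimensions: an output offset is unraveled against the output shape, then re-raveled against each input's shape, with `min(idx, shape - 1)` clamping size-1 dimensions. A standalone check of that index math (the two helpers are copied verbatim so the sketch compiles on its own):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same logic as the Unravel/Ravel helpers above (row-major shapes and strides).
void Unravel(int64_t idx, int ndim, const int64_t* shape,
             const int64_t* stride, int64_t* out) {
  for (int d = 0; d < ndim; ++d) out[d] = (idx / stride[d]) % shape[d];
}

int64_t Ravel(const int64_t* idx, int ndim, const int64_t* shape,
              const int64_t* stride) {
  int64_t out = 0;
  for (int d = 0; d < ndim; ++d)
    out += std::min(idx[d], shape[d] - 1) * stride[d];  // broadcast size-1 dims
  return out;
}

int main() {
  // out shape (2, 3) broadcast from an rhs of shape (1, 3).
  const int64_t out_shape[2] = {2, 3}, out_stride[2] = {3, 1};
  const int64_t rhs_shape[2] = {1, 3}, rhs_stride[2] = {3, 1};
  int64_t tmp[2];
  for (int64_t tx = 0; tx < 6; ++tx) {
    Unravel(tx, 2, out_shape, out_stride, tmp);
    // rhs rows are broadcast: output rows 0 and 1 both read rhs row 0.
    std::printf("out %lld -> rhs %lld\n", static_cast<long long>(tx),
                static_cast<long long>(Ravel(tmp, 2, rhs_shape, rhs_stride)));
  }
  return 0;
}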
src/kernel/cpu/binary_reduce_max.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_max.cc
* \brief CPU kernels for binary reduce max
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {

#define REDUCER ReduceMax
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
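Note: each of these per-reducer .cc files stamps out every (dtype, operand-target, operator) combination through the nested EVAL/GEN_* macros defined in the headers above. A toy reduction of the same trick (one generator macro applied over a list of operators) so the expansion is visible; all names here are hypothetical:

#include <iostream>

template <typename DType, typename Op>
struct Kernel {
  static DType Run(DType a, DType b) { return Op::Call(a, b); }
};

struct AddOp { static float Call(float a, float b) { return a + b; } };
struct MulOp { static float Call(float a, float b) { return a * b; } };

// GEN defines one explicit instantiation; GEN_OPS applies GEN to every operator,
// mirroring EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE) on a much smaller scale.
#define GEN(dtype, op) template struct Kernel<dtype, op>;
#define GEN_OPS(GEN_MACRO, dtype) GEN_MACRO(dtype, AddOp) GEN_MACRO(dtype, MulOp)

GEN_OPS(GEN, float)  // expands to explicit instantiations for AddOp and MulOp

int main() {
  std::cout << Kernel<float, AddOp>::Run(2.f, 3.f) << " "
            << Kernel<float, MulOp>::Run(2.f, 3.f) << "\n";  // 5 6
  return 0;
}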
src/kernel/cpu/binary_reduce_min.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_min.cc
* \brief CPU kernels for binary reduce min
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {

#define REDUCER ReduceMin
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_reduce_none.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_none.cc
* \brief CPU kernels for binary reduce none
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {

#define REDUCER ReduceNone
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl