Unverified commit a296c1ef, authored by pawelpiotrowicz, committed by GitHub
Browse files

[Performance] Reduces OpenMP atomic additions with zero inputs (#1527)



* critical performance fix - reduce atomic additions

 * training is up to 50% faster

* leave a TODO
Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
parent 0b902d03
...@@ -51,7 +51,8 @@ struct BackwardBinaryReduce { ...@@ -51,7 +51,8 @@ struct BackwardBinaryReduce {
DType grad_out = Functors::Read(gradoutoff + tx); DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out); DType grad_e = grad_out * Functors::BackwardWrite(e, out);
if (0 == grad_e)
continue;
DType* lhs_base = lhsoff + tx * len; DType* lhs_base = lhsoff + tx * len;
DType* rhs_base = rhsoff + tx * len; DType* rhs_base = rhsoff + tx * len;
if (Mode == binary_op::kGradBoth) { if (Mode == binary_op::kGradBoth) {
...@@ -124,6 +125,11 @@ struct BackwardBinaryReduceBcast { ...@@ -124,6 +125,11 @@ struct BackwardBinaryReduceBcast {
rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
len); len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out); DType grad_e = grad_out * Functors::BackwardWrite(e, out);
// (pawelpiotrowicz) Although we can technically add the same condition for
// skipping atomic additions as in BackwardBinaryReduce, doing so made the
// speed 2% slower in GCMC training on MovieLens-1M with 24 OpenMP threads.
// For more details, see https://github.com/dmlc/dgl/pull/1527.
// TODO(BarclayII): Needs further investigation and benchmarking.
DType* lhs_base = lhsoff + DType* lhs_base = lhsoff +
Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len;
......
...@@ -19,6 +19,8 @@ namespace kernel { ...@@ -19,6 +19,8 @@ namespace kernel {
template <typename DType> template <typename DType>
struct ReduceSum<kDLCPU, DType> { struct ReduceSum<kDLCPU, DType> {
static void Call(DType* addr, DType val) { static void Call(DType* addr, DType val) {
if (0 == val)
return;
#pragma omp atomic #pragma omp atomic
*addr += val; *addr += val;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment