[Performance] Reduces OpenMP atomic additions with zero inputs (#1527)

* critical performance fix - reduce atomic additions * training up to 50% faster * leave a TODO Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>

[Performance] Reduces OpenMP atomic additions with zero inputs (#1527)
* critical performance fix - reduce atomic additions * training up to 50% faster * leave a TODO Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
a296c1ef · pawelpiotrowicz · GitHub · 0b902d03 · a296c1ef · a296c1ef
Unverified Commit a296c1ef authored May 18, 2020 by pawelpiotrowicz Committed by GitHub May 18, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 1 deletion

src/kernel/cpu/backward_binary_reduce_impl.h src/kernel/cpu/backward_binary_reduce_impl.h +7 -1

src/kernel/cpu/functor.h src/kernel/cpu/functor.h +2 -0

No files found.
--- a/src/kernel/cpu/backward_binary_reduce_impl.h
+++ b/src/kernel/cpu/backward_binary_reduce_impl.h
@@ -51,7 +51,8 @@ struct BackwardBinaryReduce {
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);
+      if (0 == grad_e)
+        continue;
      DType* lhs_base = lhsoff + tx * len;
      DType* rhs_base = rhsoff + tx * len;
      if (Mode == binary_op::kGradBoth) {
@@ -124,6 +125,11 @@ struct BackwardBinaryReduceBcast {
        rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
        len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);
+      // (pawelpiotrowicz) Although we can technically add the same condition for
+      // skipping atomic additions as in BackwardBinaryReduce, doing so made the
+      // speed 2% slower in GCMC training on MovieLens-1M with 24 OpenMP threads.
+      // For more details, see https://github.com/dmlc/dgl/pull/1527.
+      // TODO(BarclayII): Needs further investigation and benchmarking.
      DType* lhs_base = lhsoff +
          Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len;

--- a/src/kernel/cpu/functor.h
+++ b/src/kernel/cpu/functor.h
@@ -19,6 +19,8 @@ namespace kernel {
 template <typename DType>
 struct ReduceSum<kDLCPU, DType> {
  static void Call(DType* addr, DType val) {
+    if (0 == val)
+      return;
 #pragma omp atomic
    *addr += val;
  }