"...pytorch/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "5fbb33e73dd1b05426882829875e143068a84482"
Commit bcd33e0a authored by xiang song(charlie.song), committed by Zihao Ye

[Kernel][Perf] Message builtin with broadcasting performance optimization (#815)

* upd

* fix edgebatch edges

* add test

* trigger

* Update README.md for the pytorch PinSage example.

Add a note that the PinSage model example under
example/pytorch/recommendation only works with Python 3.6+,
as its dataset loader depends on the stanfordnlp package,
which works only with Python 3.6+.

* Provide a framework-agnostic API to test nn modules on both the CPU and CUDA side.

1. make dgl.nn.xxx framework agnostic
2. make test.backend include dgl.nn modules
3. modify test_edge_softmax in tests/mxnet/test_nn.py and
    tests/pytorch/test_nn.py to work on both CPU and GPU

* Fix style

* Delete unused code

* Make the agnostic tests depend only on tests/backend

1. clear all agnostic-related code in dgl.nn
2. make test_graph_conv agnostic to cpu/gpu

* Fix code style

* fix

* doc

* Make all test code in tests/mxnet/test_nn.py and tests/pytorch/test_nn.py
work on both CPU and GPU.

* Fix syntax

* Remove rand

* Add TAGCN nn.module and example

* Now tagcn can run on CPU.

* Add unit test for TGConv

* Fix style

* For the pubmed dataset, using --lr=0.005 achieves better accuracy

* Fix style

* Fix some descriptions

* Test performance of udf

* trigger

* Fix doc

* Add nn.TGConv and example

* Update test code

* Fix bug

* Update accuracy data in the mxnet tagcn test.

* Fix some comments and code

* delete useless code

* Fix naming

* Fix bug

* Fix bug

* Add test for mxnet TAGConv

* Add test code for mxnet TAGConv

* Update some docs

* Fix some code

* Update docs dgl.nn.mxnet

* Update weight init

* Fix

* Minor optimization for UnravelRavel

* Delete test code

* Update code style and notes.

* Fix func name
parent 189c2c09
...
@@ -38,6 +38,7 @@ TAGConv
   :members: forward
   :show-inheritance:

Global Pooling Layers
----------------------------------------
...
...
@@ -67,6 +67,24 @@ struct BackwardBinaryReduce {
   }
 };
 
+// Convert flattened index to multi-dimension index (assume row-major).
+__device__ __forceinline__ void Unravel(
+    int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) {
+  for (int d = 0; d < ndim; ++d) {
+    out[d] = (idx / stride[d]) % shape[d];
+  }
+}
+
+// Convert multi-dimension index to flattened index (assume row-major).
+__device__ __forceinline__ int64_t Ravel(
+    const int64_t* idx, int ndim, const int64_t* shape, const int64_t* stride) {
+  int64_t out = 0;
+  for (int d = 0; d < ndim; ++d) {
+    out += min(idx[d], shape[d] - 1) * stride[d];
+  }
+  return out;
+}
+
 // Minigun UDF to compute backward binary reduce with broadcasting.
 template <int Mode, int NDim, typename Idx, typename DType, typename Functors>
 struct BackwardBinaryReduceBcast {
...
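A minimal host-side sketch of what the Unravel/Ravel pair computes (illustrative only, not part of the patch; the shapes and strides are made up). Unravel turns a flat output index into per-dimension coordinates, and Ravel's min(idx[d], shape[d] - 1) clamp maps those coordinates into an operand whose broadcast (size-1) dimensions collapse to index 0:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Host mirror of the device helpers above, for intuition only.
void Unravel(int64_t idx, int ndim, const int64_t* shape,
             const int64_t* stride, int64_t* out) {
  for (int d = 0; d < ndim; ++d) {
    out[d] = (idx / stride[d]) % shape[d];
  }
}

int64_t Ravel(const int64_t* idx, int ndim, const int64_t* shape,
              const int64_t* stride) {
  int64_t out = 0;
  for (int d = 0; d < ndim; ++d) {
    // Coordinates on a size-1 (broadcast) dimension clamp to 0.
    out += std::min(idx[d], shape[d] - 1) * stride[d];
  }
  return out;
}

int main() {
  // Output shape (2, 3); an rhs of shape (1, 3) is broadcast along dim 0.
  const int64_t out_shape[] = {2, 3}, out_stride[] = {3, 1};
  const int64_t rhs_shape[] = {1, 3}, rhs_stride[] = {3, 1};
  int64_t coord[2];
  Unravel(4, 2, out_shape, out_stride, coord);          // flat 4 -> (1, 1)
  assert(coord[0] == 1 && coord[1] == 1);
  assert(Ravel(coord, 2, rhs_shape, rhs_stride) == 1);  // clamped to (0, 1)
  return 0;
}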
...
@@ -54,24 +54,56 @@ struct BinaryReduce {
   }
 };
 
-// Convert flattened index to multi-dimension index (assume row-major).
-__device__ __forceinline__ void Unravel(
-    int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) {
-  for (int d = 0; d < ndim; ++d) {
-    out[d] = (idx / stride[d]) % shape[d];
-  }
-}
-
-// Convert multi-dimension index to flattened index (assume row-major).
-__device__ __forceinline__ int64_t Ravel(
-    const int64_t* idx, int ndim, const int64_t* shape, const int64_t* stride) {
-  int64_t out = 0;
-  for (int d = 0; d < ndim; ++d) {
-    out += min(idx[d], shape[d] - 1) * stride[d];
-  }
-  return out;
-}
+/*
+ * This function does the following:
+ * 1. Convert the flattened index to a multi-dimension index
+ *    according to the output shape (assume row-major).
+ * 2. Convert the multi-dimension index to a flattened index for lhs.
+ * 3. Convert the multi-dimension index to a flattened index for rhs.
+ */
+__device__ __forceinline__ void UnravelRavel(
+    const int64_t idx, const int ndim, const int64_t* out_shape, const int64_t* out_stride,
+    const int64_t* lhs_shape, const int64_t* lhs_stride,
+    const int64_t* rhs_shape, const int64_t* rhs_stride, int64_t* lhs_out, int64_t* rhs_out) {
+  if (out_stride[0] == lhs_stride[0]) {
+#pragma unroll
+    for (int d = 0; d < ndim; ++d) {
+      int64_t o_sh = out_shape[d];
+      int64_t o_st = out_stride[d];
+      int64_t rhs_sh = rhs_shape[d];
+      int64_t rhs_st = rhs_stride[d];
+      int64_t i = (idx / o_st) % o_sh;
+      /*
+       * Simplified from: rhs_out += min(i, rhs_sh - 1) * rhs_st;
+       * rhs_sh is either o_sh or 1.
+       */
+      if (rhs_sh > i) {
+        *rhs_out += i * rhs_st;
+      }
+    }
+    *lhs_out = idx;
+  } else {
+#pragma unroll
+    for (int d = 0; d < ndim; ++d) {
+      int64_t o_sh = out_shape[d];
+      int64_t o_st = out_stride[d];
+      int64_t lhs_sh = lhs_shape[d];
+      int64_t lhs_st = lhs_stride[d];
+      int64_t i = (idx / o_st) % o_sh;
+      /*
+       * Simplified from: lhs_out += min(i, lhs_sh - 1) * lhs_st;
+       * lhs_sh is either o_sh or 1.
+       */
+      if (lhs_sh > i) {
+        *lhs_out += i * lhs_st;
+      }
+    }
+    *rhs_out = idx;
+  }
+}
 
 // Minigun UDF to compute binary reduce with broadcasting.
 template <int NDim, typename Idx, typename DType, typename Functors>
 struct BinaryReduceBcast {
...
@@ -98,13 +130,14 @@ struct BinaryReduceBcast {
     DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len;
     DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len;
     DType* outoff = gdata->out_data + oid * gdata->out_len;
-    int64_t tmp[NDim];  // store unraveled idx.
     while (tx < gdata->out_len) {
-      Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
-      DType lhs = Functors::Read(lhsoff +
-          Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride));
-      DType rhs = Functors::Read(rhsoff +
-          Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride));
+      int64_t lhs_add = 0;
+      int64_t rhs_add = 0;
+      UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride,
+                   gdata->lhs_shape, gdata->lhs_stride,
+                   gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add);
+      DType lhs = Functors::Read(lhsoff + lhs_add);
+      DType rhs = Functors::Read(rhsoff + rhs_add);
       DType out = Functors::Op(lhs, rhs);
       Functors::Write(outoff + tx, out);
       tx += stride_x;
...
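The fused UnravelRavel folds the two passes (one unravel plus two ravels per output element) into a single per-dimension loop and drops the int64_t tmp[NDim] scratch array. It rests on two observations visible in its own comments: broadcasting guarantees each operand dimension is either the matching output dimension o_sh or 1, so the original min(i, sh - 1) * st term reduces to the branch (sh > i) ? i * st : 0; and when out_stride[0] == lhs_stride[0], lhs shares the output's row-major layout, so its flat offset is simply idx and no loop is needed for it at all. A standalone host-side check of the first claim (values are illustrative, not from the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // Broadcast invariant: each operand dimension sh is either o_sh or 1.
  const int64_t o_sh = 4, st = 5;
  for (int64_t sh : {int64_t{1}, o_sh}) {
    for (int64_t i = 0; i < o_sh; ++i) {           // i: output coordinate
      int64_t clamped = std::min(i, sh - 1) * st;  // original Ravel term
      int64_t fused = (sh > i) ? i * st : 0;       // UnravelRavel's branch
      assert(clamped == fused);
    }
  }
  return 0;
}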