Unverified Commit 6e46bbf5 authored by ayasar70's avatar ayasar70 Committed by GitHub
Browse files

[Performance][GPU] Improving Disjoint Union kernel for Graph Dataloaders (#3895)



* Based on issue #3436. Improving _SegmentCopyKernel's GPU utilization by switching to nonzero-based thread assignment

* fixing lint issues

* Update cub for cuda 11.5 compatibility (#3468)

* fixing type mismatch

* tx guaranteed to be smaller than nnz. Hence removing last check

* minor: updating comment

* adding three unit tests for csr slice method to cover some corner cases

* timing repeatkernel

* clean

* clean

* clean

* updating _SegmentMaskColKernel

* Working on requests: removing sorted array check and adding comments to utility functions

* fixing lint issue

* Optimizing disjoint union kernel

* Trying to resolve compilation issue on CI

* [EMPTY] Relevant commit message here

* applying revision requests on cpu/disjoint_union.cc

* removing unnecessary casts

* remove extra space
Co-authored-by: default avatarAbdurrahman Yasar <ayasar@nvidia.com>
Co-authored-by: default avatarnv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Co-authored-by: default avatarJinjing Zhou <VoVAllen@users.noreply.github.com>
Co-authored-by: default avatarQuan (Andy) Gan <coin2028@hotmail.com>
parent 306e0a46
......@@ -857,6 +857,16 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
return ret;
}
// Dispatcher for the disjoint (block-diagonal) union of COO matrices.
// Selects the device- and id-width-specific implementation from the
// context and dtype of the first input's row array; all inputs are
// expected to share that context/dtype (checked inside the impl).
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
COOMatrix ret;
// Dispatch on device type (CPU or CUDA), then on id integer width.
ATEN_XPU_SWITCH_CUDA(coos[0].row->ctx.device_type, XPU, "DisjointUnionCoo", {
ATEN_ID_TYPE_SWITCH(coos[0].row->dtype, IdType, {
ret = impl::DisjointUnionCoo<XPU, IdType>(coos);
});
});
return ret;
}
COOMatrix COOLineGraph(const COOMatrix &coo, bool backtracking) {
COOMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, "COOLineGraph", {
......
......@@ -256,6 +256,9 @@ COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray
template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
// Disjoint (block-diagonal) union of multiple COO matrices; the
// device-specific implementation is selected via the XPU parameter.
template <DLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos);
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* mat, bool sort_column);
......
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/cpu/disjoint_union.cc
* \brief Disjoint union CPU implementation.
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
IdArray prefix_src_arr = NewIdArray(
coos.size(), coos[0].row->ctx, coos[0].row->dtype.bits);
IdArray prefix_dst_arr = NewIdArray(
coos.size(), coos[0].row->ctx, coos[0].row->dtype.bits);
IdArray prefix_elm_arr = NewIdArray(
coos.size(), coos[0].row->ctx, coos[0].row->dtype.bits);
auto prefix_src = prefix_src_arr.Ptr<IdType>();
auto prefix_dst = prefix_dst_arr.Ptr<IdType>();
auto prefix_elm = prefix_elm_arr.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
for (IdType i = b; i < e; ++i) {
prefix_src[i] = coos[i].num_rows;
prefix_dst[i] = coos[i].num_cols;
prefix_elm[i] = coos[i].row->shape[0];
}
});
return std::make_tuple(CumSum(prefix_src_arr, true),
CumSum(prefix_dst_arr, true),
CumSum(prefix_elm_arr, true));
}
// Concatenates a list of COO matrices into one block-diagonal COO matrix
// (CPU implementation).  Row/column ids of matrix i are shifted by the total
// rows/columns of matrices 0..i-1; edge data, if any input carries it, is
// shifted by the running edge count.
//
// Fix vs. previous revision: row_sorted/col_sorted were updated from inside
// the parallel_for lambda — unsynchronized concurrent writes to plain bools
// are a data race (UB).  The flags are now folded in the serial validation
// loop.  Also drops the redundant static_cast<IdArray> on std::get results
// and avoids taking Ptr<IdType>() on the null data array.
template <DLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
  bool has_data = false;
  bool row_sorted = true;
  bool col_sorted = true;
  // Validate inputs and collect flags serially: writing the sorted flags
  // from the parallel loop below would be a data race.
  for (size_t i = 0; i < coos.size(); ++i) {
    CHECK_SAME_DTYPE(coos[0].row, coos[i].row);
    CHECK_SAME_CONTEXT(coos[0].row, coos[i].row);
    has_data |= COOHasData(coos[i]);
    row_sorted &= coos[i].row_sorted;
    col_sorted &= coos[i].col_sorted;
  }
  // prefix_*[i] is the shift for matrix i; prefix_*[coos.size()] the total.
  auto prefixes = _ComputePrefixSums<XPU, IdType>(coos);
  auto prefix_src = std::get<0>(prefixes).Ptr<IdType>();
  auto prefix_dst = std::get<1>(prefixes).Ptr<IdType>();
  auto prefix_elm = std::get<2>(prefixes).Ptr<IdType>();
  IdArray result_src = NewIdArray(
      prefix_elm[coos.size()], coos[0].row->ctx, coos[0].row->dtype.bits);
  IdArray result_dst = NewIdArray(
      prefix_elm[coos.size()], coos[0].col->ctx, coos[0].col->dtype.bits);
  IdArray result_dat = NullArray();
  if (has_data) {
    result_dat = NewIdArray(
        prefix_elm[coos.size()], coos[0].row->ctx, coos[0].row->dtype.bits);
  }
  auto res_src_data = result_src.Ptr<IdType>();
  auto res_dst_data = result_dst.Ptr<IdType>();
  auto res_dat_data = has_data ? result_dat.Ptr<IdType>() : nullptr;
  // Each input matrix writes a disjoint slice [prefix_elm[i],
  // prefix_elm[i+1]) of the outputs, so the per-matrix copies are
  // safe to run in parallel.
  dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) {
    for (IdType i = b; i < e; ++i) {
      const aten::COOMatrix &coo = coos[i];
      auto edges_src = coo.row.Ptr<IdType>();
      auto edges_dst = coo.col.Ptr<IdType>();
      auto edges_dat = coo.data.Ptr<IdType>();
      for (IdType j = 0; j < coo.row->shape[0]; j++) {
        res_src_data[prefix_elm[i] + j] = edges_src[j] + prefix_src[i];
      }
      for (IdType j = 0; j < coo.row->shape[0]; j++) {
        res_dst_data[prefix_elm[i] + j] = edges_dst[j] + prefix_dst[i];
      }
      if (has_data) {
        for (IdType j = 0; j < coo.row->shape[0]; j++) {
          // Matrices without an explicit data array implicitly use the
          // identity edge ids 0..nnz-1.
          const auto d = (!COOHasData(coo)) ? j : edges_dat[j];
          res_dat_data[prefix_elm[i] + j] = d + prefix_elm[i];
        }
      }
    }
  });
  return COOMatrix(
      prefix_src[coos.size()], prefix_dst[coos.size()],
      result_src,
      result_dst,
      result_dat,
      row_sorted,
      col_sorted);
}
template COOMatrix DisjointUnionCoo<kDLCPU, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDLCPU, int64_t>(const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
} // namespace dgl
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
#include <dgl/runtime/parallel_for.h>
#include <dgl/array.h>
#include <vector>
#include <tuple>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Kernel that assembles the concatenated output array: one logical
 * thread per output element, advanced in a grid-stride loop (stride =
 * gridDim.x * blockDim.x).
 *
 * Fix: the element count was an `int`, silently truncating 64-bit edge
 * counts passed by the caller when IdType is int64; it is now int64_t.
 *
 * \param arrs   Device array of per-matrix device pointers (may contain NULL
 *               entries for matrices without a data array).
 * \param prefix Per-matrix value shift (prefix sums of row/col/nnz counts).
 * \param offset Prefix sums of nnz counts; offset[i] is where matrix i's
 *               elements start in the output.
 * \param out    Output array of n_elms elements.
 * \param n_arrs Number of input matrices.
 * \param n_elms Total number of output elements.
 */
template <typename IdType>
__global__ void _DisjointUnionKernel(
    IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
    int64_t n_arrs, int64_t n_elms) {
  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < n_elms) {
    // Binary-search the nnz prefix sums to find which input matrix element
    // tx belongs to.
    IdType i = dgl::cuda::_UpperBound(offset, n_arrs, tx) - 1;
    if (arrs[i] == NULL) {
      // Missing data array: edge ids are implicitly the identity mapping.
      out[tx] = tx;
    } else {
      IdType j = tx - offset[i];
      out[tx] = arrs[i][j] + prefix[i];
    }
    tx += stride_x;
  }
}
template <DLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
IdType n = coos.size(), nbits = coos[0].row->dtype.bits;
IdArray n_rows = NewIdArray(n, CPU, nbits);
IdArray n_cols = NewIdArray(n, CPU, nbits);
IdArray n_elms = NewIdArray(n, CPU, nbits);
IdType* n_rows_data = n_rows.Ptr<IdType>();
IdType* n_cols_data = n_cols.Ptr<IdType>();
IdType* n_elms_data = n_elms.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
for (IdType i = b; i < e; ++i) {
n_rows_data[i] = coos[i].num_rows;
n_cols_data[i] = coos[i].num_cols;
n_elms_data[i] = coos[i].row->shape[0];
}
});
return std::make_tuple(CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
}
/*!
 * \brief Launches _DisjointUnionKernel to copy the per-matrix arrays into
 * one concatenated device array, shifting every element by its matrix's
 * prefix.
 *
 * Fix: n_elms was an `int`, narrowing the IdType (possibly int64) edge
 * count passed by the caller; block count is now computed in 64-bit
 * arithmetic as well.
 *
 * \param arrs   Host array of per-matrix device pointers.
 * \param prefix Device array of per-matrix value shifts.
 * \param offset Device array of nnz prefix sums (element offsets).
 * \param out    Device output array of n_elms elements.
 */
template <DLDeviceType XPU, typename IdType>
void _Merge(IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
            int64_t n_arrs, int64_t n_elms,
            DGLContext ctx, DGLType dtype, cudaStream_t stream) {
  auto device = runtime::DeviceAPI::Get(ctx);
  const int nt = 256;
  const int64_t nb = (n_elms + nt - 1) / nt;
  // The kernel dereferences the pointer table itself, so it must be copied
  // into device memory first.
  IdType** arrs_dev = static_cast<IdType**>(
      device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*)));
  device->CopyDataFromTo(
      arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs,
      DGLContext{kDLCPU, 0}, ctx, dtype, 0);
  CUDA_KERNEL_CALL(_DisjointUnionKernel,
      nb, nt, 0, stream,
      arrs_dev, prefix, offset,
      out, n_arrs, n_elms);
  device->FreeWorkspace(ctx, arrs_dev);
}
// Disjoint (block-diagonal) union of COO matrices on the GPU.  Builds the
// row/col/nnz prefix sums on the device, then launches one merge kernel per
// output array (src, dst, and data when any input has it).
template <DLDeviceType XPU, typename IdType>
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coos[0].row->ctx);
uint64_t src_offset = 0, dst_offset = 0;
bool has_data = false;
bool row_sorted = true;
bool col_sorted = true;
// Validate dtype/context consistency and detect whether any input carries
// an explicit edge-data array (which forces the result to have one).
for (size_t i = 0; i < coos.size(); ++i) {
CHECK_SAME_DTYPE(coos[0].row, coos[i].row);
CHECK_SAME_CONTEXT(coos[0].row, coos[i].row);
has_data |= COOHasData(coos[i]);
}
// Device-resident prefix sums: entry i is the shift applied to matrix i,
// entry coos.size() is the grand total.
auto prefixes = _ComputePrefixSums<XPU, IdType>(coos);
auto prefix_src = static_cast<IdType*>(std::get<0>(prefixes)->data);
auto prefix_dst = static_cast<IdType*>(std::get<1>(prefixes)->data);
auto prefix_elm = static_cast<IdType*>(std::get<2>(prefixes)->data);
// Host-side tables of per-matrix device pointers; _Merge copies each table
// to the device before launching the kernel.
std::unique_ptr<IdType*[]> rows(new IdType*[coos.size()]);
std::unique_ptr<IdType*[]> cols(new IdType*[coos.size()]);
std::unique_ptr<IdType*[]> data(new IdType*[coos.size()]);
for (size_t i = 0; i < coos.size(); i++) {
row_sorted &= coos[i].row_sorted;
col_sorted &= coos[i].col_sorted;
rows[i] = coos[i].row.Ptr<IdType>();
cols[i] = coos[i].col.Ptr<IdType>();
data[i] = coos[i].data.Ptr<IdType>();
}
auto ctx = coos[0].row->ctx;
auto dtype = coos[0].row->dtype;
auto stream = thr_entry->stream;
IdType n_elements = 0;
// Fetch the three totals (last prefix entries) back to the host.
// NOTE(review): only sizeof(IdType) bytes are copied into the 64-bit
// src_offset/dst_offset variables; this relies on their zero-initialization
// (and, presumably, little-endian layout) when IdType is int32 — verify.
// NOTE(review): stream argument 0 is passed here rather than `stream`;
// confirm CopyDataFromTo synchronizes before n_elements is used below.
device->CopyDataFromTo(
&prefix_elm[coos.size()], 0, &n_elements, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDLCPU, 0},
coos[0].row->dtype, 0);
device->CopyDataFromTo(
&prefix_src[coos.size()], 0, &src_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDLCPU, 0},
coos[0].row->dtype, 0);
device->CopyDataFromTo(
&prefix_dst[coos.size()], 0, &dst_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDLCPU, 0},
coos[0].row->dtype, 0);
// Union src array
IdArray result_src = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
// Union dst array
IdArray result_dst = NewIdArray(
n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
// Union data array if exists and fetch number of elements.  For the data
// array both the value shift and the element offset are the nnz prefix
// sums; NULL entries in `data` make the kernel emit identity edge ids.
IdArray result_dat = NullArray();
if (has_data) {
result_dat = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
}
return COOMatrix(
src_offset, dst_offset,
result_src,
result_dst,
result_dat,
row_sorted,
col_sorted);
}
template COOMatrix DisjointUnionCoo<kDLGPU, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDLGPU, int64_t>(const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
} // namespace dgl
......@@ -9,66 +9,6 @@
namespace dgl {
namespace aten {
///////////////////////// COO Based Operations/////////////////////////
// Concatenates COO matrices into one block-diagonal COO matrix.  Row and
// column ids of the i-th matrix are shifted by the total rows/columns of
// the preceding matrices; when any input carries an explicit edge-data
// array, edge ids are shifted by the running edge count.
COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
  uint64_t src_offset = 0, dst_offset = 0;
  int64_t edge_data_offset = 0;
  bool has_data = false;
  bool row_sorted = true;
  bool col_sorted = true;
  // Check consistency and whether any input has an explicit data array.
  for (size_t i = 0; i < coos.size(); ++i) {
    CHECK_SAME_DTYPE(coos[0].row, coos[i].row);
    CHECK_SAME_CONTEXT(coos[0].row, coos[i].row);
    has_data |= COOHasData(coos[i]);
  }
  std::vector<IdArray> shifted_rows(coos.size());
  std::vector<IdArray> shifted_cols(coos.size());
  std::vector<IdArray> shifted_data;
  for (size_t i = 0; i < coos.size(); ++i) {
    const aten::COOMatrix &coo = coos[i];
    row_sorted = row_sorted && coo.row_sorted;
    col_sorted = col_sorted && coo.col_sorted;
    shifted_rows[i] = coo.row + src_offset;
    shifted_cols[i] = coo.col + dst_offset;
    src_offset += coo.num_rows;
    dst_offset += coo.num_cols;
    if (has_data) {
      // Inputs without a data array implicitly use identity edge ids, so
      // materialize the shifted range for them.
      IdArray shifted = COOHasData(coo)
          ? coo.data + edge_data_offset
          : Range(edge_data_offset,
                  edge_data_offset + coo.row->shape[0],
                  coo.row->dtype.bits,
                  coo.row->ctx);
      shifted_data.push_back(shifted);
      edge_data_offset += coo.row->shape[0];
    }
  }
  return COOMatrix(
      src_offset, dst_offset,
      Concat(shifted_rows),
      Concat(shifted_cols),
      has_data ? Concat(shifted_data) : NullArray(),
      row_sorted,
      col_sorted);
}
std::vector<COOMatrix> DisjointPartitionCooBySizes(
const COOMatrix &coo,
const uint64_t batch_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment