[GraphBolt][CUDA] Make `_convert_to_sampled_subgraph` lighter. (#7312)

a3d20dce · Muhammed Fatih BALIN · GitHub · 6f9c20c3 · a3d20dce · a3d20dce
Unverified Commit a3d20dce authored Apr 18, 2024 by Muhammed Fatih BALIN Committed by GitHub Apr 18, 2024
5 changed files
--- a/graphbolt/include/graphbolt/cuda_sampling_ops.h
+++ b/graphbolt/include/graphbolt/cuda_sampling_ops.h
@@ -68,6 +68,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
    bool return_eids,
    torch::optional<torch::Tensor> type_per_edge = torch::nullopt,
    torch::optional<torch::Tensor> probs_or_mask = torch::nullopt,
+    torch::optional<torch::Tensor> node_type_offset = torch::nullopt,
    torch::optional<torch::Dict<std::string, int64_t>> node_type_to_id =
        torch::nullopt,
    torch::optional<torch::Dict<std::string, int64_t>> edge_type_to_id =

--- a/graphbolt/src/cuda/neighbor_sampler.cu
+++ b/graphbolt/src/cuda/neighbor_sampler.cu
@@ -9,6 +9,7 @@
 #include <graphbolt/continuous_seed.h>
 #include <graphbolt/cuda_ops.h>
 #include <graphbolt/cuda_sampling_ops.h>
+#include <thrust/copy.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -189,6 +190,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
    const std::vector<int64_t>& fanouts, bool replace, bool layer,
    bool return_eids, torch::optional<torch::Tensor> type_per_edge,
    torch::optional<torch::Tensor> probs_or_mask,
+    torch::optional<torch::Tensor> node_type_offset,
    torch::optional<torch::Dict<std::string, int64_t>> node_type_to_id,
    torch::optional<torch::Dict<std::string, int64_t>> edge_type_to_id,
    torch::optional<torch::Tensor> random_seed_tensor,
@@ -531,12 +533,33 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
    // Here, we check what are the dst node types for the given seeds so that
    // we can compute the output indptr space later.
    std::vector<int64_t> etype_id_to_dst_ntype_id(num_etypes);
+    // Here, we look up the src node type of each edge type so that we can
+    // subtract the source node type offset from indices later.
+    auto etype_id_to_src_ntype_id = torch::empty(
+        2 * num_etypes,
+        c10::TensorOptions().dtype(torch::kLong).pinned_memory(true));
+    auto etype_id_to_src_ntype_id_ptr =
+        etype_id_to_src_ntype_id.data_ptr<int64_t>();
    for (auto& etype_and_id : edge_type_to_id.value()) {
      auto etype = etype_and_id.key();
      auto id = etype_and_id.value();
-      auto dst_type = utils::parse_dst_ntype_from_etype(etype);
+      auto [src_type, dst_type] = utils::parse_src_dst_ntype_from_etype(etype);
      etype_id_to_dst_ntype_id[id] = node_type_to_id->at(dst_type);
+      etype_id_to_src_ntype_id_ptr[2 * id] =
+          etype_id_to_src_ntype_id_ptr[2 * id + 1] =
+              node_type_to_id->at(src_type);
    }
+    auto indices_offsets_device = torch::empty(
+        etype_id_to_src_ntype_id.size(0),
+        output_indices.options().dtype(torch::kLong));
+    AT_DISPATCH_INDEX_TYPES(
+        node_type_offset->scalar_type(), "SampleNeighborsNodeTypeOffset", ([&] {
+          THRUST_CALL(
+              gather, etype_id_to_src_ntype_id_ptr,
+              etype_id_to_src_ntype_id_ptr + etype_id_to_src_ntype_id.size(0),
+              node_type_offset->data_ptr<index_t>(),
+              indices_offsets_device.data_ptr<int64_t>());
+        }));
    // For each edge type, we compute the start and end offsets to index into
    // indptr to form the final output_indptr.
    auto indptr_offsets = torch::empty(
@@ -571,29 +594,60 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
        num_etypes * 2, c10::TensorOptions()
                            .dtype(output_indptr.scalar_type())
                            .pinned_memory(true));
+    auto edge_offsets_device =
+        torch::empty(num_etypes * 2, output_indptr.options());
    at::cuda::CUDAEvent edge_offsets_event;
    AT_DISPATCH_INDEX_TYPES(
        indptr.scalar_type(), "SampleNeighborsEdgeOffsets", ([&] {
+          auto edge_offsets_pinned_device_pair =
+              thrust::make_transform_output_iterator(
+                  thrust::make_zip_iterator(
+                      edge_offsets->data_ptr<index_t>(),
+                      edge_offsets_device.data_ptr<index_t>()),
+                  [=] __device__(index_t x) {
+                    return thrust::make_tuple(x, x);
+                  });
          THRUST_CALL(
              gather, indptr_offsets_ptr,
              indptr_offsets_ptr + indptr_offsets.size(0),
              output_indptr.data_ptr<index_t>(),
-              edge_offsets->data_ptr<index_t>());
+              edge_offsets_pinned_device_pair);
        }));
    edge_offsets_event.record();
+    auto indices_offset_subtract = ExpandIndptrImpl(
+        edge_offsets_device, indices.scalar_type(), indices_offsets_device,
+        output_indices.size(0));
    // The output_indices is permuted here.
    std::tie(output_indptr, output_indices) = IndexSelectCSCImpl(
        output_in_degree, sliced_output_indptr, output_indices, permutation,
        num_rows - 1, output_indices.size(0));
+    output_indices -= indices_offset_subtract;
+    auto output_indptr_offsets = torch::empty(
+        num_etypes * 2,
+        c10::TensorOptions().dtype(torch::kLong).pinned_memory(true));
+    auto output_indptr_offsets_ptr = output_indptr_offsets.data_ptr<int64_t>();
    std::vector<torch::Tensor> indptr_list;
    for (int i = 0; i < num_etypes; i++) {
      indptr_list.push_back(output_indptr.slice(
-          0, indptr_offsets_ptr[2 * i],
+          0, indptr_offsets_ptr[2 * i], indptr_offsets_ptr[2 * i + 1] + 1));
-          indptr_offsets_ptr[2 * i + 1] + (i == num_etypes - 1)));
+      output_indptr_offsets_ptr[2 * i] =
+          i == 0 ? 0 : output_indptr_offsets_ptr[2 * i - 1];
+      output_indptr_offsets_ptr[2 * i + 1] =
+          output_indptr_offsets_ptr[2 * i] + indptr_list.back().size(0);
    }
+    auto output_indptr_offsets_device = torch::empty(
+        output_indptr_offsets.size(0),
+        output_indptr.options().dtype(torch::kLong));
+    THRUST_CALL(
+        copy_n, output_indptr_offsets_ptr, output_indptr_offsets.size(0),
+        output_indptr_offsets_device.data_ptr<int64_t>());
    // We form the final output indptr by concatenating pieces for different
    // edge types.
    output_indptr = torch::cat(indptr_list);
+    auto indptr_offset_subtract = ExpandIndptrImpl(
+        output_indptr_offsets_device, indptr.scalar_type(), edge_offsets_device,
+        output_indptr.size(0));
+    output_indptr -= indptr_offset_subtract;
    edge_offsets_event.synchronize();
    // We read the edge_offsets here, they are in pairs but we don't need it to
    // be in pairs. So we remove the duplicate information from it and turn it

--- a/graphbolt/src/fused_csc_sampling_graph.cc
+++ b/graphbolt/src/fused_csc_sampling_graph.cc
@@ -646,8 +646,9 @@ c10::intrusive_ptr<FusedSampledSubgraph> FusedCSCSamplingGraph::SampleNeighbors(
        c10::DeviceType::CUDA, "SampleNeighbors", {
          return ops::SampleNeighbors(
              indptr_, indices_, seeds, seed_offsets, fanouts, replace, layer,
-              return_eids, type_per_edge_, probs_or_mask, node_type_to_id_,
+              return_eids, type_per_edge_, probs_or_mask, node_type_offset_,
-              edge_type_to_id_, random_seed, seed2_contribution);
+              node_type_to_id_, edge_type_to_id_, random_seed,
+              seed2_contribution);
        });
  }
  TORCH_CHECK(seeds.has_value(), "Nodes can not be None on the CPU.");

--- a/graphbolt/src/utils.h
+++ b/graphbolt/src/utils.h
@@ -27,14 +27,17 @@ inline bool is_accessible_from_gpu(torch::Tensor tensor) {
 }
 /**
- * @brief Parses the destination node type from a given edge type triple
+ * @brief Parses the source and destination node type from a given edge type
- * seperated with ":".
+ * triple separated with ":".
 */
-inline std::string parse_dst_ntype_from_etype(std::string etype) {
+inline std::pair<std::string, std::string> parse_src_dst_ntype_from_etype(
+    std::string etype) {
  auto first_seperator_it = std::find(etype.begin(), etype.end(), ':');
  auto second_seperator_pos =
      std::find(first_seperator_it + 1, etype.end(), ':') - etype.begin();
-  return etype.substr(second_seperator_pos + 1);
+  return {
+      etype.substr(0, first_seperator_it - etype.begin()),
+      etype.substr(second_seperator_pos + 1)};
 }
 /**

--- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py
+++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py
@@ -576,14 +576,14 @@ class FusedCSCSamplingGraph(SamplingGraph):
                        edge_offsets[-1]
                        + seed_offsets[ntype_id + 1]
                        - seed_offsets[ntype_id]
+                        + 1
                    )
                for etype, etype_id in self.edge_type_to_id.items():
                    src_ntype, _, dst_ntype = etype_str_to_tuple(etype)
                    ntype_id = self.node_type_to_id[dst_ntype]
-                    sub_indptr_ = indptr[
+                    sub_indptr[etype] = indptr[
-                        edge_offsets[etype_id] : edge_offsets[etype_id + 1] + 1
+                        edge_offsets[etype_id] : edge_offsets[etype_id + 1]
                    ]
-                    sub_indptr[etype] = sub_indptr_ - sub_indptr_[0]
                    sub_indices[etype] = indices[
                        etype_offsets[etype_id] : etype_offsets[etype_id + 1]
                    ]
@@ -593,8 +593,6 @@ class FusedCSCSamplingGraph(SamplingGraph):
                                etype_id + 1
                            ]
                        ]
-                    src_ntype_id = self.node_type_to_id[src_ntype]
-                    sub_indices[etype] -= offset[src_ntype_id]
            if has_original_eids:
                original_edge_ids = original_hetero_edge_ids