[Performance Improvement] Make GPU sampling and to_block use pinned memory to...

[Performance Improvement] Make GPU sampling and to_block use pinned memory to decrease required synchronization (#5685)

[Performance Improvement] Make GPU sampling and to_block use pinned memory to...
[Performance Improvement] Make GPU sampling and to_block use pinned memory to decrease required synchronization (#5685)
46af76c3 · nv-dlasalle · GitHub · 585ce94b · 46af76c3 · 46af76c3
Unverified Commit 46af76c3 authored May 17, 2023 by nv-dlasalle Committed by GitHub May 17, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 52 additions and 17 deletions

src/array/cuda/rowwise_sampling.cu src/array/cuda/rowwise_sampling.cu +19 -8

src/graph/transform/cuda/cuda_to_block.cu src/graph/transform/cuda/cuda_to_block.cu +33 -9

No files found.
--- a/src/array/cuda/rowwise_sampling.cu
+++ b/src/array/cuda/rowwise_sampling.cu
@@ -7,6 +7,7 @@
 #include <curand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
+#include <dgl/runtime/tensordispatch.h>

 #include <numeric>

@@ -15,9 +16,11 @@
 #include "./dgl_cub.cuh"
 #include "./utils.h"

+using namespace dgl::cuda;
+using namespace dgl::aten::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;
+
 namespace dgl {
-using namespace cuda;
-using namespace aten::cuda;
 namespace aten {
 namespace impl {

@@ -287,13 +290,20 @@ COOMatrix _CSRRowWiseSamplingUniform(
  cudaEvent_t copyEvent;
  CUDA_CALL(cudaEventCreate(&copyEvent));

-  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
-  // wait on a cudaevent
-  IdType new_len;
+  NDArray new_len_tensor;
+  if (TensorDispatcher::Global()->IsAvailable()) {
+    new_len_tensor = NDArray::PinnedEmpty(
+        {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+  } else {
+    // use pageable memory, it will unnecessarily block but be functional
+    new_len_tensor = NDArray::Empty(
+        {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+  }
+
  // copy using the internal current stream
-  device->CopyDataFromTo(
-      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
-      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
+  CUDA_CALL(cudaMemcpyAsync(
+      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
+      cudaMemcpyDeviceToHost, stream));
  CUDA_CALL(cudaEventRecord(copyEvent, stream));

  const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -322,6 +332,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
  CUDA_CALL(cudaEventSynchronize(copyEvent));
  CUDA_CALL(cudaEventDestroy(copyEvent));

+  const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
  picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
  picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
  picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);

--- a/src/graph/transform/cuda/cuda_to_block.cu
+++ b/src/graph/transform/cuda/cuda_to_block.cu
@@ -23,6 +23,7 @@
 #include <cuda_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
+#include <dgl/runtime/tensordispatch.h>

 #include <algorithm>
 #include <memory>
@@ -36,6 +37,7 @@
 using namespace dgl::aten;
 using namespace dgl::runtime::cuda;
 using namespace dgl::transform::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;

 namespace dgl {
 namespace transform {
@@ -165,6 +167,9 @@ struct CUDAIdsMapper {
            NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
      }
    }
+
+    cudaEvent_t copyEvent;
+    NDArray new_len_tensor;
    // Populate the mappings.
    if (generate_lhs_nodes) {
      int64_t* count_lhs_device = static_cast<int64_t*>(
@@ -174,13 +179,23 @@ struct CUDAIdsMapper {
          src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
          stream);

-      device->CopyDataFromTo(
-          count_lhs_device, 0, num_nodes_per_type.data(), 0,
-          sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
-          DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
-      device->StreamSync(ctx, stream);
+      CUDA_CALL(cudaEventCreate(&copyEvent));
+      if (TensorDispatcher::Global()->IsAvailable()) {
+        new_len_tensor = NDArray::PinnedEmpty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      } else {
+        // use pageable memory, it will unnecessarily block but be functional
+        new_len_tensor = NDArray::Empty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      }
+      CUDA_CALL(cudaMemcpyAsync(
+          new_len_tensor->data, count_lhs_device,
+          sizeof(*num_nodes_per_type.data()) * num_ntypes,
+          cudaMemcpyDeviceToHost, stream));
+      CUDA_CALL(cudaEventRecord(copyEvent, stream));

-      // Wait for the node counts to finish transferring.
      device->FreeWorkspace(ctx, count_lhs_device);
    } else {
      maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
@@ -189,14 +204,23 @@ struct CUDAIdsMapper {
        num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
      }
    }
-    // Resize lhs nodes.
+    // Map node numberings from global to local, and build pointer for CSR.
+    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);
+
    if (generate_lhs_nodes) {
+      // wait for the previous copy
+      CUDA_CALL(cudaEventSynchronize(copyEvent));
+      CUDA_CALL(cudaEventDestroy(copyEvent));
+
+      // Resize lhs nodes.
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
+        num_nodes_per_type[ntype] =
+            static_cast<int64_t*>(new_len_tensor->data)[ntype];
        lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
      }
    }
-    // Map node numberings from global to local, and build pointer for CSR.
-    return MapEdges(graph, edge_arrays, node_maps, stream);
+
+    return ret;
  }
 };