Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
28286ab3
Commit
28286ab3
authored
Oct 17, 2024
by
sangwzh
Browse files
Fix pinned-memory pointer handling bug in random walk GPU sampling
parent
35d7422e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
61 additions
and
20 deletions
+61
-20
src/graph/sampling/randomwalks/randomwalk_gpu.hip
src/graph/sampling/randomwalks/randomwalk_gpu.hip
+61
-20
No files found.
src/graph/sampling/randomwalks/randomwalk_gpu.hip
View file @
28286ab3
...
@@ -37,6 +37,22 @@ struct GraphKernelData {
...
@@ -37,6 +37,22 @@ struct GraphKernelData {
const IdType *in_cols;
const IdType *in_cols;
const IdType *data;
const IdType *data;
};
};
// Resolve the device-visible pointer for a typed NDArray.
//
// For an array in ordinary device memory this is simply `array.Ptr<IdType>()`.
// For a pinned (page-locked host) array, the host pointer must be translated
// via hipHostGetDevicePointer() before it can be dereferenced in a kernel.
//
// NOTE(review): the `__`-prefixed name is a reserved identifier in C++
// ([lex.name]); consider renaming (e.g. GetDevicePointer) in a follow-up.
//
// @param array  source array; may live in device memory or pinned host memory
// @return       pointer usable from device code
template<typename IdType>
inline IdType* __GetDevicePointer(runtime::NDArray array) {
  IdType* ptr = array.Ptr<IdType>();
  if (array.IsPinned()) {
    // hipHostGetDevicePointer takes void**; passing &ptr (an IdType**)
    // directly does not convert, so go through a void* temporary.
    void* dev_ptr = ptr;
    CUDA_CALL(hipHostGetDevicePointer(&dev_ptr, ptr, 0));
    ptr = static_cast<IdType*>(dev_ptr);
  }
  return ptr;
}
// Untyped variant: resolve the device-visible address of an NDArray's data.
//
// Arrays already resident on the device are returned as-is; pinned host
// arrays are translated through hipHostGetDevicePointer() so the returned
// address is dereferenceable from device code.
//
// @param array  source array (device memory or pinned host memory)
// @return       raw device-usable pointer to the array's data
inline void* __GetDevicePointer(runtime::NDArray array) {
  void* data_ptr = array->data;
  if (!array.IsPinned()) {
    return data_ptr;
  }
  CUDA_CALL(hipHostGetDevicePointer(&data_ptr, data_ptr, 0));
  return data_ptr;
}
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkKernel(
__global__ void _RandomWalkKernel(
...
@@ -178,14 +194,17 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
...
@@ -178,14 +194,17 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
FloatArray restart_prob) {
FloatArray restart_prob) {
const int64_t max_num_steps = metapath->shape[0];
const int64_t max_num_steps = metapath->shape[0];
const IdType *metapath_data = static_cast<IdType *>(metapath->data);
// const IdType *metapath_data = static_cast<IdType *>(metapath->data);
const IdType *metapath_data = static_cast<const IdType *>(__GetDevicePointer(metapath));
const int64_t begin_ntype =
const int64_t begin_ntype =
hg->meta_graph()->FindEdge(metapath_data[0]).first;
hg->meta_graph()->FindEdge(metapath_data[0]).first;
const int64_t max_nodes = hg->NumVertices(begin_ntype);
const int64_t max_nodes = hg->NumVertices(begin_ntype);
int64_t num_etypes = hg->NumEdgeTypes();
int64_t num_etypes = hg->NumEdgeTypes();
auto ctx = seeds->ctx;
auto ctx = seeds->ctx;
const IdType *seed_data = static_cast<const IdType *>(seeds->data);
// const IdType *seed_data = static_cast<const IdType *>(seeds->data);
const IdType *seed_data = static_cast<const IdType *>(__GetDevicePointer(seeds));
// const IdType *seed_data = static_cast<const IdType *>(__GetDevicePointer(seeds));
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
const int64_t num_seeds = seeds->shape[0];
const int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
int64_t trace_length = max_num_steps + 1;
...
@@ -197,10 +216,15 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
...
@@ -197,10 +216,15 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
// h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
// h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
// h_graphs[etype].data =
// (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
// : nullptr);
h_graphs[etype].in_ptr = static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
h_graphs[etype].in_cols = static_cast<const IdType *>(__GetDevicePointer(csr.indices));
h_graphs[etype].data =
h_graphs[etype].data =
(CSRHasData(csr) ? static_cast<const IdType *>(csr.data
->data
)
(CSRHasData(csr) ? static_cast<const IdType *>(
__GetDevicePointer(
csr.data
)
)
: nullptr);
: nullptr);
}
}
// use cuda stream from local thread
// use cuda stream from local thread
...
@@ -227,7 +251,8 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
...
@@ -227,7 +251,8 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
CHECK(restart_prob->ctx.device_type == kDGLCUDA||restart_prob->ctx.device_type == kDGLROCM)
CHECK(restart_prob->ctx.device_type == kDGLCUDA||restart_prob->ctx.device_type == kDGLROCM)
<< "restart prob should be in GPU.";
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
// const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
const FloatType *restart_prob_data = static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
const int64_t restart_prob_size = restart_prob->shape[0];
const int64_t restart_prob_size = restart_prob->shape[0];
CUDA_KERNEL_CALL(
CUDA_KERNEL_CALL(
(_RandomWalkKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
(_RandomWalkKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
...
@@ -249,21 +274,25 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
...
@@ -249,21 +274,25 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const std::vector<FloatArray> &prob, FloatArray restart_prob) {
const std::vector<FloatArray> &prob, FloatArray restart_prob) {
const int64_t max_num_steps = metapath->shape[0];
const int64_t max_num_steps = metapath->shape[0];
const IdType *metapath_data = static_cast<IdType *>(metapath->data);
// const IdType *metapath_data = static_cast<IdType *>(metapath->data);
const IdType *metapath_data = static_cast<IdType *>(__GetDevicePointer(metapath));
const int64_t begin_ntype =
const int64_t begin_ntype =
hg->meta_graph()->FindEdge(metapath_data[0]).first;
hg->meta_graph()->FindEdge(metapath_data[0]).first;
const int64_t max_nodes = hg->NumVertices(begin_ntype);
const int64_t max_nodes = hg->NumVertices(begin_ntype);
int64_t num_etypes = hg->NumEdgeTypes();
int64_t num_etypes = hg->NumEdgeTypes();
auto ctx = seeds->ctx;
auto ctx = seeds->ctx;
const IdType *seed_data = static_cast<const IdType *>(seeds->data);
// const IdType *seed_data = static_cast<const IdType *>(seeds->data);
const IdType *seed_data = static_cast<const IdType *>(__GetDevicePointer(seeds));
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
const int64_t num_seeds = seeds->shape[0];
const int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
int64_t trace_length = max_num_steps + 1;
IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
IdType *traces_data = traces.Ptr<IdType>();
IdType *traces_data = traces.Ptr<IdType>();
IdType *eids_data = eids.Ptr<IdType>();
// IdType *traces_data = static_cast<IdType *>(__GetDevicePointer(traces));
// IdType *eids_data = eids.Ptr<IdType>();
IdType *eids_data = static_cast<IdType *>(__GetDevicePointer(eids));
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
auto device = DeviceAPI::Get(ctx);
auto device = DeviceAPI::Get(ctx);
...
@@ -278,10 +307,15 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
...
@@ -278,10 +307,15 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
// h_graphs[etype].in_ptr = static_cast<const IdType *>(csr.indptr->data);
h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
// h_graphs[etype].in_cols = static_cast<const IdType *>(csr.indices->data);
// h_graphs[etype].data =
// (CSRHasData(csr) ? static_cast<const IdType *>(csr.data->data)
// : nullptr);
h_graphs[etype].in_ptr = static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
h_graphs[etype].in_cols = static_cast<const IdType *>(__GetDevicePointer(csr.indices));
h_graphs[etype].data =
h_graphs[etype].data =
(CSRHasData(csr) ? static_cast<const IdType *>(csr.data
->data
)
(CSRHasData(csr) ? static_cast<const IdType *>(
__GetDevicePointer(
csr.data
)
)
: nullptr);
: nullptr);
int64_t num_segments = csr.indptr->shape[0] - 1;
int64_t num_segments = csr.indptr->shape[0] - 1;
...
@@ -291,13 +325,16 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
...
@@ -291,13 +325,16 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
prob_sums[etype] = nullptr;
prob_sums[etype] = nullptr;
continue;
continue;
}
}
probs[etype] = prob[etype].Ptr<FloatType>();
// probs[etype] = prob[etype].Ptr<FloatType>();
probs[etype] = static_cast<FloatType *>(__GetDevicePointer(prob[etype]));
prob_sums_arr.push_back(
prob_sums_arr.push_back(
FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
// prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
prob_sums[etype] = static_cast<FloatType *>(__GetDevicePointer(prob_sums_arr[etype]));
// calculate the sum of the neighbor weights
// calculate the sum of the neighbor weights
const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
// const IdType *d_offsets = static_cast<const IdType *>(csr.indptr->data);
const IdType *d_offsets = static_cast<const IdType *>(__GetDevicePointer(csr.indptr));
size_t temp_storage_size = 0;
size_t temp_storage_size = 0;
CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum(
nullptr, temp_storage_size, probs[etype], prob_sums[etype],
nullptr, temp_storage_size, probs[etype], prob_sums[etype],
...
@@ -330,7 +367,8 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
...
@@ -330,7 +367,8 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype);
DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype);
// copy metapath to GPU
// copy metapath to GPU
auto d_metapath = metapath.CopyTo(ctx);
auto d_metapath = metapath.CopyTo(ctx);
const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
// const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
const IdType *d_metapath_data = static_cast<IdType *>(__GetDevicePointer(d_metapath));
constexpr int BLOCK_SIZE = 256;
constexpr int BLOCK_SIZE = 256;
constexpr int TILE_SIZE = BLOCK_SIZE * 4;
constexpr int TILE_SIZE = BLOCK_SIZE * 4;
...
@@ -340,8 +378,9 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
...
@@ -340,8 +378,9 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
CHECK(restart_prob->ctx.device_type == kDGLCUDA ||restart_prob->ctx.device_type == kDGLROCM)
CHECK(restart_prob->ctx.device_type == kDGLCUDA ||restart_prob->ctx.device_type == kDGLROCM)
<< "restart prob should be in GPU.";
<< "restart prob should be in GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
// const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
const int64_t restart_prob_size = restart_prob->shape[0];
const FloatType *restart_prob_data = static_cast<const FloatType *>(__GetDevicePointer(restart_prob));
const int64_t restart_prob_size = restart_prob->shape[0];
CUDA_KERNEL_CALL(
CUDA_KERNEL_CALL(
(_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
(_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>), grid,
block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data,
block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data,
...
@@ -446,8 +485,10 @@ std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
...
@@ -446,8 +485,10 @@ std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
const IdArray src, const IdArray dst, const int64_t num_samples_per_node,
const int64_t k) {
const int64_t k) {
CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
CHECK(src->ctx.device_type == kDGLCUDA || src->ctx.device_type == kDGLROCM) << "IdArray needs be on GPU!";
const IdxType *src_data = src.Ptr<IdxType>();
// const IdxType *src_data = src.Ptr<IdxType>();
const IdxType *dst_data = dst.Ptr<IdxType>();
const IdxType *src_data = static_cast<IdxType*>(__GetDevicePointer(src));
// const IdxType *dst_data = dst.Ptr<IdxType>();
const IdxType *dst_data = static_cast<IdxType*>(__GetDevicePointer(dst));
const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node);
auto ctx = src->ctx;
auto ctx = src->ctx;
// use cuda stream from local thread
// use cuda stream from local thread
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment