Unverified commit 86c81b4e authored by Xin Yao, committed by GitHub

[Feature] Add CUDA Weighted Neighborhood Sampling (#4064)



* add weighted sampling without replacement (A-Chao; sketched just after this commit message)

* improve Algorithm A-Chao with block-wise prefix sum

* correctly fill out_idxs

* implement weighted sampling with replacement

* small fix

* merge host-side code of weighted/uniform sampling

* enable unit tests for cuda weighted sampling

* move thrust/cub wrapper to the cmake file

* update docs accordingly

* fix linting

* fix linting

* fix unit test

* Bump external CUB/Thrust versions

* Fix code style and update description of algorithm design

* [Feature] GPU support for weighted graph neighbor sampling
commit by pengqirong (OPPO)

* merge pengqirong's implementation

* revert the change to cub and thrust

* fix linting

* use DeviceSegmentedSort for better performance

* add more comments

* add necessary notes

* add necessary notes

* resolve some comments

* define THRUST_CUB_WRAPPED_NAMESPACE

* fix doc
Co-authored-by: 彭齐荣 <657017034@qq.com>
parent 17f1432a
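
For background on the first two commit bullets: Algorithm A-Chao is a weighted reservoir sampling scheme that keeps a fixed-size reservoir and admits each new item with probability proportional to its weight. A serial Python sketch of the basic idea (illustrative only; it assumes the reservoir can be seeded with the first k items and that no single weight exceeds its share of the running total; the CUDA version in this commit parallelizes the admission step with block-wise prefix sums):

import random

def a_chao(weights, k, rng=random):
    """Pick k indices with probability proportional to weights[i], without
    replacement, in one pass (basic A-Chao; assumes k * w_i <= W_i)."""
    reservoir = list(range(k))       # seed the reservoir with the first k items
    wsum = sum(weights[:k])          # running total W_i of weights seen so far
    for i in range(k, len(weights)):
        wsum += weights[i]
        # admit item i with probability k * w_i / W_i ...
        if rng.random() < k * weights[i] / wsum:
            # ... evicting a uniformly random reservoir slot
            reservoir[rng.randrange(k)] = i
    return reservoir
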
......@@ -12,8 +12,7 @@
url = https://github.com/KarypisLab/METIS.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
branch = 1.8.0
url = https://github.com/NVIDIA/cub.git
[submodule "third_party/phmap"]
path = third_party/phmap
url = https://github.com/greg7mdp/parallel-hashmap.git
......
......@@ -46,18 +46,12 @@ endif(NOT MSVC)
if(USE_CUDA)
message(STATUS "Build with CUDA support")
project(dgl C CXX)
# see https://github.com/NVIDIA/thrust/issues/1401
add_definitions(-DTHRUST_CUB_WRAPPED_NAMESPACE=dgl)
include(cmake/modules/CUDA.cmake)
if ((CUDA_VERSION_MAJOR LESS 11) OR
((CUDA_VERSION_MAJOR EQUAL 11) AND (CUDA_VERSION_MINOR EQUAL 0)))
# For cuda<11, use external CUB/Thrust library because CUB is not part of CUDA.
# For cuda==11.0, use external CUB/Thrust library because there is a bug in the
# official CUB library which causes invalid device ordinal error for DGL. The bug
# is fixed by https://github.com/NVIDIA/cub/commit/9143e47e048641aa0e6ddfd645bcd54ff1059939
# in 11.1.
message(STATUS "Detected CUDA of version ${CUDA_VERSION}. Use external CUB/Thrust library.")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/thrust")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cub")
endif()
message(STATUS "Use external CUB/Thrust library for a consistent API and performance.")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/thrust")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cub")
endif(USE_CUDA)
# initial variables
......
......@@ -60,7 +60,7 @@ Using CUDA UVA-based neighborhood sampling in DGL data loaders
For the case where the graph is too large to fit onto the GPU memory, we introduce the
CUDA UVA (Unified Virtual Addressing)-based sampling, in which GPUs perform the sampling
on the graph pinned on CPU memory via zero-copy access.
on the graph pinned in CPU memory via zero-copy access.
You can enable UVA-based neighborhood sampling in DGL data loaders via:
* Put the ``train_nid`` onto the GPU.
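
Putting the pieces together, a minimal sketch of a UVA-enabled dataloader (names follow the standard ``dgl.dataloading`` API of recent DGL releases, not code from this diff; the graph and fanouts are illustrative):

import dgl
import dgl.dataloading
import torch

# a toy graph standing in for one too large to fit in GPU memory
g = dgl.rand_graph(1000, 5000)
train_nid = torch.arange(100).to('cuda')     # seed nodes go onto the GPU

sampler = dgl.dataloading.NeighborSampler([10, 10])
dataloader = dgl.dataloading.DataLoader(
    g, train_nid, sampler,
    device='cuda',     # sampled subgraphs are produced on the GPU
    use_uva=True,      # pin g in CPU memory; the GPU samples via zero-copy
    batch_size=64, shuffle=True)

for input_nodes, output_nodes, blocks in dataloader:
    pass  # train on the GPU-resident blocks
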
......@@ -138,9 +138,6 @@ You can build your own GPU sampling pipelines with the following functions that
operate on GPU:
* :func:`dgl.sampling.sample_neighbors`
* Only has support for uniform sampling; non-uniform sampling can only run on CPU.
* :func:`dgl.sampling.random_walk`
Subgraph extraction ops:
......
......@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No
The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
replace : bool, optional
If True, sample with replacement.
copy_ndata: bool, optional
......@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
Node/edge features are not preserved. The original IDs of
the sampled edges are stored as the `dgl.EID` feature in the returned graph.
GPU sampling is supported for this function. Refer to :ref:`guide-minibatch-gpu-sampling`
for more details.
Parameters
----------
g : DGLGraph
......@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
exclude_edges: tensor or dict
Edge IDs to exclude during sampling neighbors for the seed nodes.
......
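
With the docstring change above, a weighted sample can now stay on the GPU end to end. A small illustrative sketch (tensor values are made up; note that, per the new CHECK_SAME_CONTEXT in the next hunk, the probability feature must live on the same device as the seed nodes):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2, 3]),
               torch.tensor([4, 4, 4, 4]))).to('cuda')
# non-negative weights whose per-node sum is positive; the zero-weight
# edge 1->4 is never drawn
g.edata['p'] = torch.tensor([1.0, 0.0, 2.0, 1.0], device='cuda')

sg = dgl.sampling.sample_neighbors(g, torch.tensor([4], device='cuda'),
                                   2, prob='p')
print(sg.edata[dgl.EID])   # original IDs of the sampled edges
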
......@@ -549,11 +549,12 @@ COOMatrix CSRRowWiseSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
COOMatrix ret;
if (IsNullArray(prob)) {
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSamplingUniform", {
ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
});
} else {
ATEN_CSR_SWITCH(mat, XPU, IdType, "CSRRowWiseSampling", {
CHECK_SAME_CONTEXT(rows, prob);
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>(
mat, rows, num_samples, prob, replace);
......
......@@ -7,13 +7,11 @@
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
// include cub in a safe manner
#define CUB_NS_PREFIX namespace dgl {
#define CUB_NS_POSTFIX }
#define CUB_NS_QUALIFIER ::dgl::cub
// This should be defined in CMakeLists.txt
#ifndef THRUST_CUB_WRAPPED_NAMESPACE
static_assert(false, "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
#endif
#include "cub/cub.cuh"
#undef CUB_NS_QUALIFIER
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
/*!
* Copyright (c) 2021 by Contributors
* \file array/cuda/rowwise_sampling.cu
* \brief rowwise sampling
* \brief uniform rowwise sampling
*/
#include <dgl/random.h>
......@@ -13,6 +13,7 @@
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
using namespace dgl::aten::cuda;
namespace dgl {
......@@ -21,7 +22,7 @@ namespace impl {
namespace {
constexpr int CTA_SIZE = 128;
constexpr int BLOCK_SIZE = 128;
/**
* @brief Compute the size of each row in the sampled CSR, without replacement.
......@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x;
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int in_row = in_rows[tIdx];
const int out_row = tIdx;
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row+1]-in_ptr[in_row]);
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
if (out_row == num_rows-1) {
if (out_row == num_rows - 1) {
// append a trailing zero so the exclusive prefix sum also yields the total
out_deg[num_rows] = 0;
}
......@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x;
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx;
if (in_ptr[in_row+1]-in_ptr[in_row] == 0) {
if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) {
out_deg[out_row] = 0;
} else {
out_deg[out_row] = static_cast<IdType>(num_picks);
}
if (out_row == num_rows-1) {
if (out_row == num_rows - 1) {
// append a trailing zero so the exclusive prefix sum also yields the total
out_deg[num_rows] = 0;
}
......@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
}
/**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix,
* without replacement.
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
......@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel(
template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
......@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel(
IdType * const out_cols,
IdType * const out_idxs) {
// we assign one thread block per row
assert(blockDim.x == CTA_SIZE);
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng);
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t deg = in_ptr[row+1] - in_row_start;
const int64_t deg = in_ptr[row + 1] - in_row_start;
const int64_t out_row_start = out_ptr[out_row];
if (deg <= num_picks) {
// just copy row
for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) {
const IdType in_idx = in_row_start+idx;
out_rows[out_row_start+idx] = row;
out_cols[out_row_start+idx] = in_index[in_idx];
out_idxs[out_row_start+idx] = data ? data[in_idx] : in_idx;
// just copy the row when there are not enough neighbors to sample.
for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const IdType in_idx = in_row_start + idx;
out_rows[out_row_start + idx] = row;
out_cols[out_row_start + idx] = in_index[in_idx];
out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx;
}
} else {
// generate permutation list via reservoir algorithm
for (int idx = threadIdx.x; idx < num_picks; idx+=CTA_SIZE) {
out_idxs[out_row_start+idx] = idx;
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
out_idxs[out_row_start + idx] = idx;
}
__syncthreads();
for (int idx = num_picks+threadIdx.x; idx < deg; idx+=CTA_SIZE) {
const int num = curand(&rng)%(idx+1);
for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng) % (idx + 1);
if (num < num_picks) {
// use max so as to achieve the replacement order the serial
// algorithm would have
AtomicMax(out_idxs+out_row_start+num, idx);
AtomicMax(out_idxs + out_row_start + num, idx);
}
}
__syncthreads();
// copy permutation over
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
const IdType perm_idx = out_idxs[out_row_start+idx]+in_row_start;
out_rows[out_row_start+idx] = row;
out_cols[out_row_start+idx] = in_index[perm_idx];
if (data) {
out_idxs[out_row_start+idx] = data[perm_idx];
}
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start;
out_rows[out_row_start + idx] = row;
out_cols[out_row_start + idx] = in_index[perm_idx];
out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx;
}
}
out_row += BLOCK_CTAS;
out_row += 1;
}
}
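
For reference, the without-replacement branch above mirrors the classic serial reservoir algorithm; a plain Python sketch of the serial order the AtomicMax reproduces (illustrative, with `reservoir_permutation` a made-up name):

import random

def reservoir_permutation(deg, num_picks, rng=random):
    # start with the identity prefix, as the kernel does before __syncthreads()
    out = list(range(num_picks))
    for idx in range(num_picks, deg):
        num = rng.randrange(idx + 1)
        if num < num_picks:
            # serially this is a plain overwrite; since idx only grows, the
            # last write is also the largest, which is why the kernel can
            # use AtomicMax to resolve concurrent replacements
            out[num] = idx
    return out   # indices into the row, to be offset by in_row_start
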
/**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix,
* with replacement.
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, with replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
......@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel(
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel(
template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformReplaceKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
......@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
IdType * const out_cols,
IdType * const out_idxs) {
// we assign one thread block per row
assert(blockDim.x == CTA_SIZE);
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng);
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row];
const int64_t deg = in_ptr[row+1] - in_row_start;
const int64_t deg = in_ptr[row + 1] - in_row_start;
if (deg > 0) {
// each thread blindly samples edges from the row, but only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg;
const int64_t out_idx = out_row_start+idx;
const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start+edge];
out_idxs[out_idx] = data ? data[in_row_start+edge] : in_row_start+edge;
out_cols[out_idx] = in_index[in_row_start + edge];
out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
}
}
out_row += BLOCK_CTAS;
out_row += 1;
}
}
} // namespace
/////////////////////////////// CSR ///////////////////////////////
///////////////////////////// CSR sampling //////////////////////////
template <DLDeviceType XPU, typename IdType>
COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
......@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// compute degree
IdType * out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType)));
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) {
const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x);
_CSRRowWiseSampleDegreeReplaceKernel<<<grid, block, 0, stream>>>(
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
} else {
const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x);
_CSRRowWiseSampleDegreeKernel<<<grid, block, 0, stream>>>(
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
}
// fill out_ptr
IdType * out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType)));
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg,
......@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent
IdType new_len;
device->CopyDataFromTo(out_ptr, num_rows*sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
// select edges
if (replace) {
constexpr int BLOCK_CTAS = 128/CTA_SIZE;
// the number of rows each thread block will cover
constexpr int TILE_SIZE = BLOCK_CTAS;
const dim3 block(CTA_SIZE, BLOCK_CTAS);
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE);
_CSRRowWiseSampleReplaceKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
// the number of rows each thread block will cover
constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
if (replace) { // with replacement
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
......@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
out_rows,
out_cols,
out_idxs);
} else {
constexpr int BLOCK_CTAS = 128/CTA_SIZE;
// the number of rows each thread block will cover
constexpr int TILE_SIZE = BLOCK_CTAS;
const dim3 block(CTA_SIZE, BLOCK_CTAS);
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE);
_CSRRowWiseSampleKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
} else { // without replacement
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
......
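
The host-side pattern in this file — per-row sampled degrees followed by cub::DeviceScan::ExclusiveSum — builds the output CSR offsets. The same computation in a few lines of Python (illustrative):

def build_out_ptr(out_deg):
    # exclusive prefix sum over num_rows + 1 entries (including the
    # appended trailing zero)
    out_ptr, running = [], 0
    for d in out_deg:
        out_ptr.append(running)
        running += d
    return out_ptr

# e.g. out_deg = [2, 0, 3, 0]  ->  out_ptr = [0, 2, 2, 5]
# out_ptr[-1] is new_len, the total number of sampled edges, which the
# host reads back with CopyDataFromTo above
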
This diff is collapsed.
......@@ -625,12 +625,10 @@ def test_sample_neighbors_noprob():
_test_sample_neighbors(False, None)
#_test_sample_neighbors(True)
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors with probability is not implemented")
def test_sample_neighbors_prob():
_test_sample_neighbors(False, 'prob')
#_test_sample_neighbors(True)
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_outedge():
_test_sample_neighbors_outedge(False)
#_test_sample_neighbors_outedge(True)
......@@ -645,9 +643,8 @@ def test_sample_neighbors_topk_outedge():
_test_sample_neighbors_topk_outedge(False)
#_test_sample_neighbors_topk_outedge(True)
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_with_0deg():
g = dgl.graph(([], []), num_nodes=5)
g = dgl.graph(([], []), num_nodes=5).to(F.ctx())
sg = dgl.sampling.sample_neighbors(g, F.tensor([1, 2], dtype=F.int64), 2, edge_dir='in', replace=False)
assert sg.number_of_edges() == 0
sg = dgl.sampling.sample_neighbors(g, F.tensor([1, 2], dtype=F.int64), 2, edge_dir='in', replace=True)
......@@ -884,7 +881,6 @@ def test_sample_neighbors_etype_sorted_homogeneous(format_, direction):
assert fail
@pytest.mark.parametrize('dtype', ['int32', 'int64'])
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_exclude_edges_heteroG(dtype):
d_i_d_u_nodes = F.zerocopy_from_numpy(np.unique(np.random.randint(300, size=100, dtype=dtype)))
d_i_d_v_nodes = F.zerocopy_from_numpy(np.random.randint(25, size=d_i_d_u_nodes.shape, dtype=dtype))
......@@ -897,7 +893,7 @@ def test_sample_neighbors_exclude_edges_heteroG(dtype):
('drug', 'interacts', 'drug'): (d_i_d_u_nodes, d_i_d_v_nodes),
('drug', 'interacts', 'gene'): (d_i_g_u_nodes, d_i_g_v_nodes),
('drug', 'treats', 'disease'): (d_t_d_u_nodes, d_t_d_v_nodes)
})
}).to(F.ctx())
(U, V, EID) = (0, 1, 2)
......@@ -950,11 +946,10 @@ def test_sample_neighbors_exclude_edges_heteroG(dtype):
etype=('drug','treats','disease'))))
@pytest.mark.parametrize('dtype', ['int32', 'int64'])
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_exclude_edges_homoG(dtype):
u_nodes = F.zerocopy_from_numpy(np.unique(np.random.randint(300,size=100, dtype=dtype)))
v_nodes = F.zerocopy_from_numpy(np.random.randint(25, size=u_nodes.shape, dtype=dtype))
g = dgl.graph((u_nodes, v_nodes))
g = dgl.graph((u_nodes, v_nodes)).to(F.ctx())
(U, V, EID) = (0, 1, 2)
......
Subproject commit a3ee304a1f8e22f278df10600df2e4b333012592
Subproject commit cdaa9558a85e45d849016e5fe7b6e4ee79113f95
Subproject commit 0ef5c509856e12cc408f0f00ed586b4c5b1a155c
Subproject commit 6a3078c64cab0e2f276340fa5dcafa0d758ed890