Unverified commit d3bd4c61 authored by Muhammed Fatih BALIN, committed by GitHub

[Feature] Adding kappa feature for labor (Cooperative Minibatching) (#6006)


Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent e34a5072
......@@ -463,6 +463,7 @@ COOMatrix COOReorder(
* @param prob Probability array for nonuniform sampling
* @param importance_sampling Whether to enable importance sampling
* @param random_seed The random seed for the sampler
* @param seed2_contribution The contribution of the second random seed, in [0, 1)
* @param NIDs global nids if sampling from a subgraph
* @return A pair of COOMatrix storing the picked row and col indices and edge
* weights if importance_sampling != 0 or prob argument was passed.
......@@ -472,7 +473,8 @@ COOMatrix COOReorder(
std::pair<COOMatrix, FloatArray> COOLaborSampling(
COOMatrix mat, IdArray rows, int64_t num_samples,
FloatArray prob = NullArray(), int importance_sampling = 0,
IdArray random_seed = NullArray(), IdArray NIDs = NullArray());
IdArray random_seed = NullArray(), float seed2_contribution = 0,
IdArray NIDs = NullArray());
/**
* @brief Randomly select a fixed number of non-zero entries along each given
......
......@@ -518,6 +518,7 @@ CSRMatrix CSRRemove(CSRMatrix csr, IdArray entries);
* @param prob Probability array for nonuniform sampling
* @param importance_sampling Whether to enable importance sampling
* @param random_seed The random seed for the sampler
* @param seed2_contribution The contribution of the second random seed, in [0, 1)
* @param NIDs global nids if sampling from a subgraph
* @return A pair of COOMatrix storing the picked row and col indices and edge
* weights if importance_sampling != 0 or prob argument was passed. Its
......@@ -527,7 +528,8 @@ CSRMatrix CSRRemove(CSRMatrix csr, IdArray entries);
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples,
FloatArray prob = NullArray(), int importance_sampling = 0,
IdArray random_seed = NullArray(), IdArray NIDs = NullArray());
IdArray random_seed = NullArray(), float seed2_contribution = 0,
IdArray NIDs = NullArray());
/*!
* @brief Randomly select a fixed number of non-zero entries along each given
......
......@@ -17,6 +17,8 @@
#
"""Data loading components for labor sampling"""
from numpy.random import default_rng
from .. import backend as F
from ..base import EID, NID
from ..random import choice
......@@ -67,6 +69,10 @@ class LaborSampler(BlockSampler):
Specifies whether different layers should use the same random variates.
Results in a reduction in the number of vertices sampled, but may
degrade the quality slightly.
batch_dependency : int, default ``1``
Specifies how many consecutive minibatches should use similar random
variates (the kappa parameter of Cooperative Minibatching). Results in
higher temporal access locality of sampled vertices, but may degrade
the quality slightly.
prefetch_node_feats : list[str] or dict[ntype, list[str]], optional
The source node data to prefetch for the first MFG, corresponding to the
input node features necessary for the first GNN layer.
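To illustrate the new batch_dependency argument documented above, a minimal usage sketch of LaborSampler follows; the graph ``g``, the index tensor ``train_nids`` and all numeric values are placeholders, not part of this patch:

import dgl
import torch

# `g` is assumed to be a DGLGraph and `train_nids` a tensor of training node IDs.
sampler = dgl.dataloading.LaborSampler(
    [10, 10],               # fanouts for a 2-layer GNN
    importance_sampling=0,
    layer_dependency=False,
    batch_dependency=8,     # kappa: consecutive minibatches reuse similar variates
)
dataloader = dgl.dataloading.DataLoader(
    g, train_nids, sampler, batch_size=1024, shuffle=True
)
for input_nodes, output_nodes, blocks in dataloader:
    ...  # train on `blocks`; nearby iterations touch largely overlapping nodes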
......@@ -139,6 +145,7 @@ class LaborSampler(BlockSampler):
prob=None,
importance_sampling=0,
layer_dependency=False,
batch_dependency=1,
prefetch_node_feats=None,
prefetch_labels=None,
prefetch_edge_feats=None,
......@@ -155,7 +162,13 @@ class LaborSampler(BlockSampler):
self.prob = prob
self.importance_sampling = importance_sampling
self.layer_dependency = layer_dependency
self.set_seed()
self.cnt = F.zeros(2, F.int64, F.cpu())
self.cnt[0] = -1
self.cnt[1] = batch_dependency
self.random_seed = F.zeros(
2 if self.cnt[1] > 1 else 1, F.int64, F.cpu()
)
self.set_seed(None if batch_dependency > 0 else choice(1e18, 1).item())
def set_seed(self, random_seed=None):
"""Updates the underlying seed for the sampler
......@@ -184,9 +197,23 @@ class LaborSampler(BlockSampler):
The random seed to be used for next sampling call.
"""
if random_seed is None:
self.random_seed = choice(1e18, 1)
self.cnt[0] += 1
if self.cnt[1] > 0 and self.cnt[0] % self.cnt[1] == 0:
if self.cnt[0] <= 0 or self.cnt[1] <= 1:
if not hasattr(self, "rng"):
self.rng = default_rng(choice(1e18, 1).item())
self.random_seed[0] = self.rng.integers(1e18)
if self.cnt[1] > 1:
self.random_seed[1] = self.rng.integers(1e18)
else:
self.random_seed = F.tensor(random_seed, F.int64)
self.random_seed[0] = self.random_seed[1]
self.random_seed[1] = self.rng.integers(1e18)
else:
self.rng = default_rng(random_seed)
self.random_seed[0] = self.rng.integers(1e18)
if self.cnt[1] > 1:
self.random_seed[1] = self.rng.integers(1e18)
self.cnt[0] = 0
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
output_nodes = seed_nodes
......@@ -195,6 +222,10 @@ class LaborSampler(BlockSampler):
random_seed_i = F.zerocopy_to_dgl_ndarray(
self.random_seed + (i if not self.layer_dependency else 0)
)
if self.cnt[1] <= 1:
seed2_contr = 0
else:
seed2_contr = ((self.cnt[0] % self.cnt[1]) / self.cnt[1]).item()
frontier, importances = g.sample_labors(
seed_nodes,
fanout,
......@@ -202,6 +233,7 @@ class LaborSampler(BlockSampler):
prob=self.prob,
importance_sampling=self.importance_sampling,
random_seed=random_seed_i,
seed2_contribution=seed2_contr,
output_device=self.output_device,
exclude_edges=exclude_eids,
)
......
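The net effect of the seed bookkeeping above: with batch_dependency = kappa > 1 the sampler keeps a pair of seeds, every kappa minibatches the second seed is promoted to the first and a fresh second seed is drawn, and seed2_contr ramps from 0 to (kappa - 1)/kappa in between, so the random variates drift gradually instead of jumping every batch. A simplified, self-contained sketch of that schedule (plain Python ints instead of DGL tensors; the class name and call points are illustrative only):

from numpy.random import default_rng

class SeedSchedule:
    """Toy version of the bookkeeping done by LaborSampler.set_seed above."""

    def __init__(self, batch_dependency, initial_seed=None):
        self.kappa = max(1, batch_dependency)
        self.rng = default_rng(initial_seed)
        self.seeds = [int(self.rng.integers(10**18)) for _ in range(2)]
        self.batch = -1

    def step(self):
        """Advance one minibatch; return (seed_pair, seed2_contribution)."""
        self.batch += 1
        if self.kappa > 1 and self.batch > 0 and self.batch % self.kappa == 0:
            # promote the second seed and draw a fresh one, so the blend
            # continues smoothly across the kappa-batch boundary
            self.seeds = [self.seeds[1], int(self.rng.integers(10**18))]
        contribution = (
            (self.batch % self.kappa) / self.kappa if self.kappa > 1 else 0.0
        )
        return tuple(self.seeds), contribution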
......@@ -37,6 +37,7 @@ def sample_labors(
prob=None,
importance_sampling=0,
random_seed=None,
seed2_contribution=0,
copy_ndata=True,
copy_edata=True,
exclude_edges=None,
......@@ -109,6 +110,10 @@ def sample_labors(
If this function is called without a ``random_seed``, the random seed is drawn from DGL's
random number generator. Pass an identical random_seed to multiple calls of this function
when they are used to sample as part of a single batch.
seed2_contribution : float, optional
A float value in [0, 1) that determines the contribution of the
second random seed when generating the random variates for the
LABOR sampling algorithm.
copy_ndata: bool, optional
If True, the node features of the new graph are copied from
the original graph. If False, the new graph will not have any
......@@ -206,6 +211,7 @@ def sample_labors(
prob=prob,
importance_sampling=importance_sampling,
random_seed=random_seed,
seed2_contribution=seed2_contribution,
copy_ndata=copy_ndata,
copy_edata=copy_edata,
exclude_edges=exclude_edges,
......@@ -219,6 +225,7 @@ def sample_labors(
prob=prob,
importance_sampling=importance_sampling,
random_seed=random_seed,
seed2_contribution=seed2_contribution,
copy_ndata=copy_ndata,
copy_edata=copy_edata,
)
......@@ -242,6 +249,7 @@ def _sample_labors(
prob=None,
importance_sampling=0,
random_seed=None,
seed2_contribution=0,
copy_ndata=True,
copy_edata=True,
exclude_edges=None,
......@@ -329,6 +337,7 @@ def _sample_labors(
excluded_edges_all_t,
importance_sampling,
random_seed,
seed2_contribution,
nids_all_types,
)
subgidx = ret_val[0]
......
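A hedged example of calling dgl.sampling.sample_labors directly with the new keyword; the graph, node IDs and seeds are made up, the two-element int64 random_seed mirrors what LaborSampler passes internally, and the (frontier, importances) return shape follows the usage in the sampler code above:

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2, 3]), torch.tensor([1, 2, 3, 0])))
seed_nodes = torch.tensor([0, 2])
random_seed = torch.tensor([4242, 1717], dtype=torch.int64)  # two seeds to blend

frontier, importances = dgl.sampling.sample_labors(
    g,
    seed_nodes,
    2,                        # fanout
    importance_sampling=0,
    random_seed=random_seed,
    seed2_contribution=0.25,  # 25% weight on the second seed's variates
)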
......@@ -556,14 +556,16 @@ CSRMatrix CSRRemove(CSRMatrix csr, IdArray entries) {
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob,
int importance_sampling, IdArray random_seed, IdArray NIDs) {
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs) {
std::pair<COOMatrix, FloatArray> ret;
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRLaborSampling", {
const auto dtype =
IsNullArray(prob) ? DGLDataTypeTraits<float>::dtype : prob->dtype;
ATEN_FLOAT_TYPE_SWITCH(dtype, FloatType, "probability", {
ret = impl::CSRLaborSampling<XPU, IdType, FloatType>(
mat, rows, num_samples, prob, importance_sampling, random_seed, NIDs);
mat, rows, num_samples, prob, importance_sampling, random_seed,
seed2_contribution, NIDs);
});
});
return ret;
......@@ -829,14 +831,16 @@ COOMatrix COORemove(COOMatrix coo, IdArray entries) {
std::pair<COOMatrix, FloatArray> COOLaborSampling(
COOMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob,
int importance_sampling, IdArray random_seed, IdArray NIDs) {
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs) {
std::pair<COOMatrix, FloatArray> ret;
ATEN_COO_SWITCH(mat, XPU, IdType, "COOLaborSampling", {
const auto dtype =
IsNullArray(prob) ? DGLDataTypeTraits<float>::dtype : prob->dtype;
ATEN_FLOAT_TYPE_SWITCH(dtype, FloatType, "probability", {
ret = impl::COOLaborSampling<XPU, IdType, FloatType>(
mat, rows, num_samples, prob, importance_sampling, random_seed, NIDs);
mat, rows, num_samples, prob, importance_sampling, random_seed,
seed2_contribution, NIDs);
});
});
return ret;
......
......@@ -168,12 +168,8 @@ CSRMatrix CSRRemove(CSRMatrix csr, IdArray entries);
template <DGLDeviceType XPU, typename IdType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat,
IdArray rows,
int64_t num_samples,
FloatArray prob,
int importance_sampling,
IdArray random_seed,
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob,
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs);
// FloatType is the type of probability data.
......@@ -285,12 +281,8 @@ COOMatrix COORemove(COOMatrix coo, IdArray entries);
template <DGLDeviceType XPU, typename IdType, typename FloatType>
std::pair<COOMatrix, FloatArray> COOLaborSampling(
COOMatrix mat,
IdArray rows,
int64_t num_samples,
FloatArray prob,
int importance_sampling,
IdArray random_seed,
COOMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob,
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs);
// FloatType is the type of probability data.
......
......@@ -37,10 +37,14 @@
#include <utility>
#include <vector>
#include "../../random/continuous_seed.h"
namespace dgl {
namespace aten {
namespace impl {
using dgl::random::continuous_seed;
constexpr double eps = 0.0001;
template <typename IdxType, typename FloatType>
......@@ -155,7 +159,8 @@ auto compute_importance_sampling_probabilities(
template <typename IdxType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborPick(
CSRMatrix mat, IdArray rows, int64_t num_picks, FloatArray prob,
int importance_sampling, IdArray random_seed_arr, IdArray NIDs) {
int importance_sampling, IdArray random_seed_arr, float seed2_contribution,
IdArray NIDs) {
using namespace aten;
const IdxType* indptr = mat.indptr.Ptr<IdxType>();
const IdxType* indices = mat.indices.Ptr<IdxType>();
......@@ -218,10 +223,10 @@ std::pair<COOMatrix, FloatArray> CSRLaborPick(
IdxType* picked_idata = picked_idx.Ptr<IdxType>();
FloatType* picked_imp_data = picked_imp.Ptr<FloatType>();
const uint64_t random_seed =
const continuous_seed random_seed =
IsNullArray(random_seed_arr)
? RandomEngine::ThreadLocal()->RandInt(1000000000)
: random_seed_arr.Ptr<int64_t>()[0];
? continuous_seed(RandomEngine::ThreadLocal()->RandInt(1000000000))
: continuous_seed(random_seed_arr, seed2_contribution);
// compute number of edges first and do sampling
IdxType num_edges = 0;
......@@ -233,11 +238,9 @@ std::pair<COOMatrix, FloatArray> CSRLaborPick(
const auto off = num_edges;
for (auto j = indptr[rid]; j < indptr[rid + 1]; j++) {
const auto v = indices[j];
const auto t = nids ? nids[v] : v; // t in the paper
pcg32 ng(random_seed, t);
std::uniform_real_distribution<FloatType> uni;
const uint64_t t = nids ? nids[v] : v; // t in the paper
// rolled random number r_t is a function of the random_seed and t
const auto rnd = uni(ng);
const auto rnd = random_seed.uniform(t);
const auto w = (weighted ? A[j] : 1);
// if hop_map is initialized, get ps from there, otherwise get it from the
// alternative.
......@@ -281,13 +284,15 @@ std::pair<COOMatrix, FloatArray> CSRLaborPick(
template <typename IdxType, typename FloatType>
std::pair<COOMatrix, FloatArray> COOLaborPick(
COOMatrix mat, IdArray rows, int64_t num_picks, FloatArray prob,
int importance_sampling, IdArray random_seed, IdArray NIDs) {
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs) {
using namespace aten;
const auto& csr = COOToCSR(COOSliceRows(mat, rows));
const IdArray new_rows =
Range(0, rows->shape[0], rows->dtype.bits, rows->ctx);
const auto&& picked_importances = CSRLaborPick<IdxType, FloatType>(
csr, new_rows, num_picks, prob, importance_sampling, random_seed, NIDs);
csr, new_rows, num_picks, prob, importance_sampling, random_seed,
seed2_contribution, NIDs);
const auto& picked = picked_importances.first;
const auto& importances = picked_importances.second;
return std::make_pair(
......
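Stripped of edge weights and importance sampling, the pick rule in CSRLaborPick above is: each neighbour t of a seed vertex gets a variate r_t that depends only on (random_seed, t), and it is kept when r_t falls below the row's threshold (fanout / degree in the uniform case). A toy Python rendering under those simplifying assumptions (the helper name and the rand callback are illustrative):

def labor_pick_row(neighbors, num_picks, rand):
    """Toy LABOR pick for one seed row; rand(t) stands in for
    continuous_seed.uniform(t), i.e. a deterministic function of (seed, t)."""
    degree = len(neighbors)
    if degree <= num_picks:
        return list(neighbors)          # small rows are kept in full
    threshold = num_picks / degree      # c_s for the unweighted, non-IS case
    return [t for t in neighbors if rand(t) <= threshold]

Because r_t is shared by every seed vertex in the minibatch (and, with batch_dependency, correlated across nearby minibatches), a neighbour that passes the test for one seed tends to pass it for the others as well, which is what reduces the number of distinct vertices sampled.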
......@@ -29,46 +29,50 @@ namespace impl {
template <DGLDeviceType XPU, typename IdxType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob,
int importance_sampling, IdArray random_seed, IdArray NIDs) {
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs) {
return CSRLaborPick<IdxType, FloatType>(
mat, rows, num_samples, prob, importance_sampling, random_seed, NIDs);
mat, rows, num_samples, prob, importance_sampling, random_seed,
seed2_contribution, NIDs);
}
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCPU, int32_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCPU, int64_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCPU, int32_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCPU, int64_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
/////////////////////////////// COO ///////////////////////////////
template <DGLDeviceType XPU, typename IdxType, typename FloatType>
std::pair<COOMatrix, FloatArray> COOLaborSampling(
COOMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob,
int importance_sampling, IdArray random_seed, IdArray NIDs) {
int importance_sampling, IdArray random_seed, float seed2_contribution,
IdArray NIDs) {
return COOLaborPick<IdxType, FloatType>(
mat, rows, num_samples, prob, importance_sampling, random_seed, NIDs);
mat, rows, num_samples, prob, importance_sampling, random_seed,
seed2_contribution, NIDs);
}
template std::pair<COOMatrix, FloatArray>
COOLaborSampling<kDGLCPU, int32_t, float>(
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
COOLaborSampling<kDGLCPU, int64_t, float>(
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
COOLaborSampling<kDGLCPU, int32_t, double>(
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
COOLaborSampling<kDGLCPU, int64_t, double>(
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
COOMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
} // namespace impl
} // namespace aten
......
......@@ -15,11 +15,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/cuda/labor_sampling.cu
* \brief labor sampling
* @file array/cuda/labor_sampling.cu
* @brief labor sampling
*/
#include <curand_kernel.h>
#include <dgl/aten/coo.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
......@@ -43,6 +42,7 @@
#include "../../array/cuda/atomic.cuh"
#include "../../array/cuda/utils.h"
#include "../../graph/transform/cuda/cuda_map_edges.cuh"
#include "../../random/continuous_seed.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./dgl_cub.cuh"
#include "./functor.cuh"
......@@ -52,6 +52,8 @@ namespace dgl {
namespace aten {
namespace impl {
using dgl::random::continuous_seed;
constexpr int BLOCK_SIZE = 128;
constexpr int CTA_SIZE = 128;
constexpr double eps = 0.0001;
......@@ -119,7 +121,7 @@ struct StencilOp {
template <typename IdType, typename FloatType, typename ps_t, typename A_t>
struct StencilOpFused {
const uint64_t rand_seed;
const continuous_seed seed;
const IdType* idx_coo;
const FloatType* cs;
const ps_t probs;
......@@ -136,10 +138,8 @@ struct StencilOpFused {
const auto in_idx = indptr[in_row] + rofs;
const auto u = indices[is_pinned ? idx : in_idx];
const auto t = nids ? nids[u] : u; // t in the paper
curandStatePhilox4_32_10_t rng;
// rolled random number r_t is a function of the random_seed and t
curand_init(123123, rand_seed, t, &rng);
const float rnd = curand_uniform(&rng);
const float rnd = seed.uniform(t);
return rnd <= cs[in_row] * A[in_idx] * ps;
}
};
......@@ -205,16 +205,14 @@ struct DegreeFunc {
template <typename IdType, typename FloatType>
__global__ void _CSRRowWiseOneHopExtractorKernel(
const uint64_t rand_seed, const IdType hop_size, const IdType* const indptr,
const IdType* const subindptr, const IdType* const indices,
const IdType* const idx_coo, const IdType* const nids,
const FloatType* const A, FloatType* const rands, IdType* const hop,
FloatType* const A_l) {
const continuous_seed seed, const IdType hop_size,
const IdType* const indptr, const IdType* const subindptr,
const IdType* const indices, const IdType* const idx_coo,
const IdType* const nids, const FloatType* const A, FloatType* const rands,
IdType* const hop, FloatType* const A_l) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
curandStatePhilox4_32_10_t rng;
while (tx < hop_size) {
IdType rpos = idx_coo[tx];
IdType rofs = tx - subindptr[rpos];
......@@ -222,12 +220,10 @@ __global__ void _CSRRowWiseOneHopExtractorKernel(
const auto not_pinned = indices != hop;
const auto u = indices[not_pinned ? in_idx : tx];
if (not_pinned) hop[tx] = u;
const auto v = nids ? nids[u] : u;
// 123123 is just a number with no significance.
curand_init(123123, rand_seed, v, &rng);
const float rnd = curand_uniform(&rng);
const auto t = nids ? nids[u] : u;
if (A) A_l[tx] = A[in_idx];
rands[tx] = (FloatType)rnd;
// rolled random number r_t is a function of the random_seed and t
rands[tx] = (FloatType)seed.uniform(t);
tx += stride_x;
}
}
......@@ -356,7 +352,7 @@ int log_size(const IdType size) {
template <typename IdType, typename FloatType, typename exec_policy_t>
void compute_importance_sampling_probabilities(
CSRMatrix mat, const IdType hop_size, cudaStream_t stream,
const uint64_t random_seed, const IdType num_rows, const IdType* indptr,
const continuous_seed seed, const IdType num_rows, const IdType* indptr,
const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr,
const IdType* nids,
FloatArray cs_arr, // holds the computed cs values, has size num_rows
......@@ -384,8 +380,8 @@ void compute_importance_sampling_probabilities(
const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseOneHopExtractorKernel<IdType, FloatType>), grid, block, 0,
stream, random_seed, hop_size, indptr, subindptr, indices, idx_coo,
nids, weighted ? A : nullptr, rands, hop_1, A_l);
stream, seed, hop_size, indptr, subindptr, indices, idx_coo, nids,
weighted ? A : nullptr, rands, hop_1, A_l);
}
int64_t hop_uniq_size = 0;
IdArray hop_new_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8);
......@@ -518,7 +514,7 @@ template <DGLDeviceType XPU, typename IdType, typename FloatType>
std::pair<COOMatrix, FloatArray> CSRLaborSampling(
CSRMatrix mat, IdArray rows_arr, const int64_t num_picks,
FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr,
IdArray NIDs) {
float seed2_contribution, IdArray NIDs) {
const bool weighted = !IsNullArray(prob_arr);
const auto& ctx = rows_arr->ctx;
......@@ -663,10 +659,10 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
indptr.get(), nullptr, A, subindptr);
}
const uint64_t random_seed =
const continuous_seed random_seed =
IsNullArray(random_seed_arr)
? RandomEngine::ThreadLocal()->RandInt(1000000000)
: random_seed_arr.Ptr<int64_t>()[0];
? continuous_seed(RandomEngine::ThreadLocal()->RandInt(1000000000))
: continuous_seed(random_seed_arr, seed2_contribution);
if (importance_sampling)
compute_importance_sampling_probabilities<
......@@ -822,16 +818,16 @@ std::pair<COOMatrix, FloatArray> CSRLaborSampling(
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCUDA, int32_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCUDA, int64_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCUDA, int32_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
template std::pair<COOMatrix, FloatArray>
CSRLaborSampling<kDGLCUDA, int64_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, IdArray);
CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray);
} // namespace impl
} // namespace aten
......
......@@ -87,7 +87,8 @@ std::pair<HeteroSubgraph, std::vector<FloatArray>> SampleLabors(
const std::vector<int64_t>& fanouts, EdgeDir dir,
const std::vector<FloatArray>& prob,
const std::vector<IdArray>& exclude_edges, const int importance_sampling,
const IdArray random_seed, const std::vector<IdArray>& NIDs) {
const IdArray random_seed, const float seed2_contribution,
const std::vector<IdArray>& NIDs) {
// sanity check
CHECK_EQ(nodes.size(), hg->NumVertexTypes())
<< "Number of node ID tensors must match the number of node types.";
......@@ -133,13 +134,14 @@ std::pair<HeteroSubgraph, std::vector<FloatArray>> SampleLabors(
auto fs = aten::COOLaborSampling(
aten::COOTranspose(hg->GetCOOMatrix(etype)), nodes_ntype,
fanout, prob[etype], importance_sampling, random_seed,
NIDs_ntype);
seed2_contribution, NIDs_ntype);
sampled_coo = aten::COOTranspose(fs.first);
importances = fs.second;
} else {
std::tie(sampled_coo, importances) = aten::COOLaborSampling(
hg->GetCOOMatrix(etype), nodes_ntype, fanout, prob[etype],
importance_sampling, random_seed, NIDs_ntype);
importance_sampling, random_seed, seed2_contribution,
NIDs_ntype);
}
break;
case SparseFormat::kCSR:
......@@ -147,13 +149,13 @@ std::pair<HeteroSubgraph, std::vector<FloatArray>> SampleLabors(
<< "Cannot sample out edges on CSC matrix.";
std::tie(sampled_coo, importances) = aten::CSRLaborSampling(
hg->GetCSRMatrix(etype), nodes_ntype, fanout, prob[etype],
importance_sampling, random_seed, NIDs_ntype);
importance_sampling, random_seed, seed2_contribution, NIDs_ntype);
break;
case SparseFormat::kCSC:
CHECK(dir == EdgeDir::kIn) << "Cannot sample in edges on CSR matrix.";
std::tie(sampled_coo, importances) = aten::CSRLaborSampling(
hg->GetCSCMatrix(etype), nodes_ntype, fanout, prob[etype],
importance_sampling, random_seed, NIDs_ntype);
importance_sampling, random_seed, seed2_contribution, NIDs_ntype);
sampled_coo = aten::COOTranspose(sampled_coo);
break;
default:
......@@ -523,7 +525,8 @@ DGL_REGISTER_GLOBAL("sampling.labor._CAPI_DGLSampleLabors")
const auto& exclude_edges = ListValueToVector<IdArray>(args[5]);
const int importance_sampling = args[6];
const IdArray random_seed = args[7];
const auto& NIDs = ListValueToVector<IdArray>(args[8]);
const double seed2_contribution = args[8];
const auto& NIDs = ListValueToVector<IdArray>(args[9]);
CHECK(dir_str == "in" || dir_str == "out")
<< "Invalid edge direction. Must be \"in\" or \"out\".";
......@@ -533,7 +536,7 @@ DGL_REGISTER_GLOBAL("sampling.labor._CAPI_DGLSampleLabors")
auto&& subg_importances = sampling::SampleLabors(
hg.sptr(), nodes, fanouts, dir, prob, exclude_edges,
importance_sampling, random_seed, NIDs);
importance_sampling, random_seed, seed2_contribution, NIDs);
*subg_ptr = subg_importances.first;
List<Value> ret_val;
ret_val.push_back(Value(subg_ptr));
......
/*!
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file dgl/continuous_seed.h
* @brief CPU and CUDA implementation for continuous random seeds
*/
#ifndef DGL_RANDOM_CONTINUOUS_SEED_H_
#define DGL_RANDOM_CONTINUOUS_SEED_H_
#include <dgl/array.h>
#include <cmath>
#ifdef __NVCC__
#include <curand_kernel.h>
#else
#include <random>
#include "pcg_random.hpp"
#endif  // __NVCC__
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
#endif // M_SQRT1_2
namespace dgl {
namespace random {
class continuous_seed {
uint64_t s[2];
float c[2];
public:
/* implicit */ continuous_seed(const int64_t seed) { // NOLINT
s[0] = s[1] = seed;
c[0] = c[1] = 0;
}
continuous_seed(IdArray seed_arr, float r) {
auto seed = seed_arr.Ptr<int64_t>();
s[0] = seed[0];
s[1] = seed[seed_arr->shape[0] - 1];
const auto pi = std::acos(-1.0);
c[0] = std::cos(pi * r / 2);
c[1] = std::sin(pi * r / 2);
}
#ifdef __CUDA_ARCH__
__device__ inline float uniform(const uint64_t t) const {
const uint64_t kCurandSeed = 999961; // Could be any random number.
curandStatePhilox4_32_10_t rng;
curand_init(kCurandSeed, s[0], t, &rng);
float rnd;
if (s[0] != s[1]) {
rnd = c[0] * curand_normal(&rng);
curand_init(kCurandSeed, s[1], t, &rng);
rnd += c[1] * curand_normal(&rng);
rnd = normcdff(rnd);
} else {
rnd = curand_uniform(&rng);
}
return rnd;
}
#else
inline float uniform(const uint64_t t) const {
pcg32 ng0(s[0], t);
float rnd;
if (s[0] != s[1]) {
std::normal_distribution<float> norm;
rnd = c[0] * norm(ng0);
pcg32 ng1(s[1], t);
norm.reset();
rnd += c[1] * norm(ng1);
rnd = std::erfc(-rnd * static_cast<float>(M_SQRT1_2)) / 2.0f;
} else {
std::uniform_real_distribution<float> uni;
rnd = uni(ng0);
}
return rnd;
}
#endif // __CUDA_ARCH__
};
} // namespace random
} // namespace dgl
#endif // DGL_RANDOM_CONTINUOUS_SEED_H_
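In words, continuous_seed blends two pseudo-random streams: for a vertex t it draws one standard normal per seed, combines them with weights cos(pi*r/2) and sin(pi*r/2) (their squares sum to 1, so the blend is again standard normal), and maps the result back to a uniform through the normal CDF; r = 0 reproduces the first seed's variate and r close to 1 approaches the second's. A NumPy sketch of the same math, with a hash-seeded generator standing in for the Philox/PCG counter-based generators used here:

import math
import numpy as np

def keyed_normal(seed, t):
    # deterministic standard normal keyed by (seed, t); any counter-based
    # generator keyed the same way would serve (assumption for illustration)
    return np.random.default_rng([seed, t]).standard_normal()

def continuous_uniform(seed0, seed1, r, t):
    if seed0 == seed1:
        return np.random.default_rng([seed0, t]).random()
    c0, c1 = math.cos(math.pi * r / 2), math.sin(math.pi * r / 2)
    z = c0 * keyed_normal(seed0, t) + c1 * keyed_normal(seed1, t)
    return 0.5 * math.erfc(-z * math.sqrt(0.5))  # Phi(z), the standard normal CDF

With seed2_contribution = (i mod kappa) / kappa, the variate of every vertex therefore changes continuously as the minibatch index i advances, instead of being redrawn independently each batch.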