Unverified commit 86c81b4e authored by Xin Yao, committed by GitHub

[Feature] Add CUDA Weighted Neighborhood Sampling (#4064)



* add weighted sampling without replacement (A-Chao; sketched just after this commit message)

* improve Algorithm A-Chao with block-wise prefix sum

* correctly fill out_idxs

* implement weighted sampling with replacement

* small fix

* merge host-side code of weighted/uniform sampling

* enable unit tests for cuda weighted sampling

* move thrust/cub wrapper to the cmake file

* update docs accordingly

* fix linting

* fix linting

* fix unit test

* Bump external CUB/Thrust versions

* Fix code style and update description of algorithm design

* [Feature] GPU support for weighted graph neighbor sampling
commit by pengqirong (OPPO)

* merge pengqirong's implementation

* revert the change to cub and thrust

* fix linting

* use DeviceSegmentedSort for better performance

* add more comments

* add necessary notes

* add necessary notes

* resolve some comments

* define THRUST_CUB_WRAPPED_NAMESPACE

* fix doc
Co-authored-by: 彭齐荣 <657017034@qq.com>
parent 17f1432a
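
For background on the first two commit bullets: Algorithm A-Chao is a weighted reservoir sampling scheme that keeps a fixed-size reservoir and admits each new item with probability proportional to its weight. A serial Python sketch of the basic idea (illustrative only; it assumes the reservoir can be seeded with the first k items and that no single weight exceeds its share of the running total; the CUDA version in this commit parallelizes the admission step with block-wise prefix sums):

import random

def a_chao(weights, k, rng=random):
    """Pick k indices with probability proportional to weights[i], without
    replacement, in one pass (basic A-Chao; assumes k * w_i <= W_i)."""
    reservoir = list(range(k))       # seed the reservoir with the first k items
    wsum = sum(weights[:k])          # running total W_i of weights seen so far
    for i in range(k, len(weights)):
        wsum += weights[i]
        # admit item i with probability k * w_i / W_i ...
        if rng.random() < k * weights[i] / wsum:
            # ... evicting a uniformly random reservoir slot
            reservoir[rng.randrange(k)] = i
    return reservoir
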
......@@ -12,8 +12,7 @@
url = https://github.com/KarypisLab/METIS.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
branch = 1.8.0
url = https://github.com/NVIDIA/cub.git
[submodule "third_party/phmap"]
path = third_party/phmap
url = https://github.com/greg7mdp/parallel-hashmap.git
......
......@@ -46,18 +46,12 @@ endif(NOT MSVC)
if(USE_CUDA)
message(STATUS "Build with CUDA support")
project(dgl C CXX)
# see https://github.com/NVIDIA/thrust/issues/1401
add_definitions(-DTHRUST_CUB_WRAPPED_NAMESPACE=dgl)
include(cmake/modules/CUDA.cmake)
if ((CUDA_VERSION_MAJOR LESS 11) OR
((CUDA_VERSION_MAJOR EQUAL 11) AND (CUDA_VERSION_MINOR EQUAL 0)))
# For cuda<11, use external CUB/Thrust library because CUB is not part of CUDA.
# For cuda==11.0, use external CUB/Thrust library because there is a bug in the
# official CUB library which causes invalid device ordinal error for DGL. The bug
# is fixed by https://github.com/NVIDIA/cub/commit/9143e47e048641aa0e6ddfd645bcd54ff1059939
# in 11.1.
message(STATUS "Detected CUDA of version ${CUDA_VERSION}. Use external CUB/Thrust library.")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/thrust")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cub")
endif()
message(STATUS "Use external CUB/Thrust library for a consistent API and performance.")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/thrust")
cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cub")
endif(USE_CUDA)
# initial variables
......
......@@ -60,7 +60,7 @@ Using CUDA UVA-based neighborhood sampling in DGL data loaders
For the case where the graph is too large to fit onto the GPU memory, we introduce the
CUDA UVA (Unified Virtual Addressing)-based sampling, in which GPUs perform the sampling
on the graph pinned on CPU memory via zero-copy access.
on the graph pinned in CPU memory via zero-copy access.
You can enable UVA-based neighborhood sampling in DGL data loaders via:
* Put the ``train_nid`` onto the GPU.
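
Putting the pieces together, a minimal sketch of a UVA-enabled dataloader (names follow the standard ``dgl.dataloading`` API of recent DGL releases, not code from this diff; the graph and fanouts are illustrative):

import dgl
import dgl.dataloading
import torch

# a toy graph standing in for one too large to fit in GPU memory
g = dgl.rand_graph(1000, 5000)
train_nid = torch.arange(100).to('cuda')     # seed nodes go onto the GPU

sampler = dgl.dataloading.NeighborSampler([10, 10])
dataloader = dgl.dataloading.DataLoader(
    g, train_nid, sampler,
    device='cuda',     # sampled subgraphs are produced on the GPU
    use_uva=True,      # pin g in CPU memory; the GPU samples via zero-copy
    batch_size=64, shuffle=True)

for input_nodes, output_nodes, blocks in dataloader:
    pass  # train on the GPU-resident blocks
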
......@@ -138,9 +138,6 @@ You can build your own GPU sampling pipelines with the following functions that
operate on GPU:
* :func:`dgl.sampling.sample_neighbors`
* Only has support for uniform sampling; non-uniform sampling can only run on CPU.
* :func:`dgl.sampling.random_walk`
Subgraph extraction ops:
......
......@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No
The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
replace : bool, optional
If True, sample with replacement.
copy_ndata: bool, optional
......@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
Node/edge features are not preserved. The original IDs of
the sampled edges are stored as the `dgl.EID` feature in the returned graph.
GPU sampling is supported for this function. Refer to :ref:`guide-minibatch-gpu-sampling`
for more details.
Parameters
----------
g : DGLGraph
......@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
exclude_edges: tensor or dict
Edge IDs to exclude during sampling neighbors for the seed nodes.
......
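
With the docstring change above, a weighted sample can now stay on the GPU end to end. A small illustrative sketch (tensor values are made up; note that, per the new CHECK_SAME_CONTEXT in the next hunk, the probability feature must live on the same device as the seed nodes):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2, 3]),
               torch.tensor([4, 4, 4, 4]))).to('cuda')
# non-negative weights whose per-node sum is positive; the zero-weight
# edge 1->4 is never drawn
g.edata['p'] = torch.tensor([1.0, 0.0, 2.0, 1.0], device='cuda')

sg = dgl.sampling.sample_neighbors(g, torch.tensor([4], device='cuda'),
                                   2, prob='p')
print(sg.edata[dgl.EID])   # original IDs of the sampled edges
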
......@@ -549,11 +549,12 @@ COOMatrix CSRRowWiseSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
COOMatrix ret;
if (IsNullArray(prob)) {
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSamplingUniform", {
ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
});
} else {
ATEN_CSR_SWITCH(mat, XPU, IdType, "CSRRowWiseSampling", {
CHECK_SAME_CONTEXT(rows, prob);
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>(
mat, rows, num_samples, prob, replace);
......
......@@ -7,13 +7,11 @@
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
// include cub in a safe manner
#define CUB_NS_PREFIX namespace dgl {
#define CUB_NS_POSTFIX }
#define CUB_NS_QUALIFIER ::dgl::cub
// This should be defined in CMakeLists.txt
#ifndef THRUST_CUB_WRAPPED_NAMESPACE
static_assert(false, "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
#endif
#include "cub/cub.cuh"
#undef CUB_NS_QUALIFIER
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
/*!
* Copyright (c) 2021 by Contributors
* \file array/cuda/rowwise_sampling.cu
* \brief rowwise sampling
* \brief uniform rowwise sampling
*/
#include <dgl/random.h>
......@@ -13,6 +13,7 @@
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
using namespace dgl::aten::cuda;
namespace dgl {
......@@ -21,7 +22,7 @@ namespace impl {
namespace {
constexpr int CTA_SIZE = 128;
constexpr int BLOCK_SIZE = 128;
/**
* @brief Compute the size of each row in the sampled CSR, without replacement.
......@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x;
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int in_row = in_rows[tIdx];
const int out_row = tIdx;
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row+1]-in_ptr[in_row]);
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
if (out_row == num_rows-1) {
if (out_row == num_rows - 1) {
// append a trailing zero so the exclusive prefix sum also yields the total
out_deg[num_rows] = 0;
}
......@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x;
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx;
if (in_ptr[in_row+1]-in_ptr[in_row] == 0) {
if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) {
out_deg[out_row] = 0;
} else {
out_deg[out_row] = static_cast<IdType>(num_picks);
}
if (out_row == num_rows-1) {
if (out_row == num_rows - 1) {
// append a trailing zero so the exclusive prefix sum also yields the total
out_deg[num_rows] = 0;
}
......@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
}
/**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix,
* without replacement.
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
......@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel(
template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
......@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel(
IdType * const out_cols,
IdType * const out_idxs) {
// we assign one thread block per row
assert(blockDim.x == CTA_SIZE);
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng);
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t deg = in_ptr[row+1] - in_row_start;
const int64_t deg = in_ptr[row + 1] - in_row_start;
const int64_t out_row_start = out_ptr[out_row];
if (deg <= num_picks) {
// just copy row
for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) {
const IdType in_idx = in_row_start+idx;
out_rows[out_row_start+idx] = row;
out_cols[out_row_start+idx] = in_index[in_idx];
out_idxs[out_row_start+idx] = data ? data[in_idx] : in_idx;
// just copy the row when there are not enough neighbors to sample.
for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const IdType in_idx = in_row_start + idx;
out_rows[out_row_start + idx] = row;
out_cols[out_row_start + idx] = in_index[in_idx];
out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx;
}
} else {
// generate permutation list via reservoir algorithm
for (int idx = threadIdx.x; idx < num_picks; idx+=CTA_SIZE) {
out_idxs[out_row_start+idx] = idx;
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
out_idxs[out_row_start + idx] = idx;
}
__syncthreads();
for (int idx = num_picks+threadIdx.x; idx < deg; idx+=CTA_SIZE) {
const int num = curand(&rng)%(idx+1);
for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng) % (idx + 1);
if (num < num_picks) {
// use max so as to achieve the replacement order the serial
// algorithm would have
AtomicMax(out_idxs+out_row_start+num, idx);
AtomicMax(out_idxs + out_row_start + num, idx);
}
}
__syncthreads();
// copy permutation over
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
const IdType perm_idx = out_idxs[out_row_start+idx]+in_row_start;
out_rows[out_row_start+idx] = row;
out_cols[out_row_start+idx] = in_index[perm_idx];
if (data) {
out_idxs[out_row_start+idx] = data[perm_idx];
}
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start;
out_rows[out_row_start + idx] = row;
out_cols[out_row_start + idx] = in_index[perm_idx];
out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx;
}
}
out_row += BLOCK_CTAS;
out_row += 1;
}
}
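
For reference, the without-replacement branch above mirrors the classic serial reservoir algorithm; a plain Python sketch of the serial order the AtomicMax reproduces (illustrative, with `reservoir_permutation` a made-up name):

import random

def reservoir_permutation(deg, num_picks, rng=random):
    # start with the identity prefix, as the kernel does before __syncthreads()
    out = list(range(num_picks))
    for idx in range(num_picks, deg):
        num = rng.randrange(idx + 1)
        if num < num_picks:
            # serially this is a plain overwrite; since idx only grows, the
            # last write is also the largest, which is why the kernel can
            # use AtomicMax to resolve concurrent replacements
            out[num] = idx
    return out   # indices into the row, to be offset by in_row_start
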
/**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix,
* with replacement.
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, with replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
......@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel(
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel(
template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformReplaceKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
......@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
IdType * const out_cols,
IdType * const out_idxs) {
// we assign one thread block per row
assert(blockDim.x == CTA_SIZE);
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng);
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row];
const int64_t deg = in_ptr[row+1] - in_row_start;
const int64_t deg = in_ptr[row + 1] - in_row_start;
if (deg > 0) {
// each thread blindly samples edges from the row, but only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg;
const int64_t out_idx = out_row_start+idx;
const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start+edge];
out_idxs[out_idx] = data ? data[in_row_start+edge] : in_row_start+edge;
out_cols[out_idx] = in_index[in_row_start + edge];
out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
}
}
out_row += BLOCK_CTAS;
out_row += 1;
}
}
} // namespace
/////////////////////////////// CSR ///////////////////////////////
///////////////////////////// CSR sampling //////////////////////////
template <DLDeviceType XPU, typename IdType>
COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
......@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// compute degree
IdType * out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType)));
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) {
const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x);
_CSRRowWiseSampleDegreeReplaceKernel<<<grid, block, 0, stream>>>(
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
} else {
const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x);
_CSRRowWiseSampleDegreeKernel<<<grid, block, 0, stream>>>(
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
}
// fill out_ptr
IdType * out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType)));
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg,
......@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent
IdType new_len;
device->CopyDataFromTo(out_ptr, num_rows*sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
// select edges
if (replace) {
constexpr int BLOCK_CTAS = 128/CTA_SIZE;
// the number of rows each thread block will cover
constexpr int TILE_SIZE = BLOCK_CTAS;
const dim3 block(CTA_SIZE, BLOCK_CTAS);
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE);
_CSRRowWiseSampleReplaceKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
// the number of rows each thread block will cover
constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
if (replace) { // with replacement
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
......@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
out_rows,
out_cols,
out_idxs);
} else {
constexpr int BLOCK_CTAS = 128/CTA_SIZE;
// the number of rows each thread block will cover
constexpr int TILE_SIZE = BLOCK_CTAS;
const dim3 block(CTA_SIZE, BLOCK_CTAS);
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE);
_CSRRowWiseSampleKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
} else { // without replacement
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
......
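
The host-side pattern in this file — per-row sampled degrees followed by cub::DeviceScan::ExclusiveSum — builds the output CSR offsets. The same computation in a few lines of Python (illustrative):

def build_out_ptr(out_deg):
    # exclusive prefix sum over num_rows + 1 entries (including the
    # appended trailing zero)
    out_ptr, running = [], 0
    for d in out_deg:
        out_ptr.append(running)
        running += d
    return out_ptr

# e.g. out_deg = [2, 0, 3, 0]  ->  out_ptr = [0, 2, 2, 5]
# out_ptr[-1] is new_len, the total number of sampled edges, which the
# host reads back with CopyDataFromTo above
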
This diff is collapsed.
......@@ -625,12 +625,10 @@ def test_sample_neighbors_noprob():
_test_sample_neighbors(False, None)
#_test_sample_neighbors(True)
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors with probability is not implemented")
def test_sample_neighbors_prob():
_test_sample_neighbors(False, 'prob')
#_test_sample_neighbors(True)
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_outedge():
_test_sample_neighbors_outedge(False)
#_test_sample_neighbors_outedge(True)
......@@ -645,9 +643,8 @@ def test_sample_neighbors_topk_outedge():
_test_sample_neighbors_topk_outedge(False)
#_test_sample_neighbors_topk_outedge(True)
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_with_0deg():
g = dgl.graph(([], []), num_nodes=5)
g = dgl.graph(([], []), num_nodes=5).to(F.ctx())
sg = dgl.sampling.sample_neighbors(g, F.tensor([1, 2], dtype=F.int64), 2, edge_dir='in', replace=False)
assert sg.number_of_edges() == 0
sg = dgl.sampling.sample_neighbors(g, F.tensor([1, 2], dtype=F.int64), 2, edge_dir='in', replace=True)
......@@ -884,7 +881,6 @@ def test_sample_neighbors_etype_sorted_homogeneous(format_, direction):
assert fail
@pytest.mark.parametrize('dtype', ['int32', 'int64'])
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_exclude_edges_heteroG(dtype):
d_i_d_u_nodes = F.zerocopy_from_numpy(np.unique(np.random.randint(300, size=100, dtype=dtype)))
d_i_d_v_nodes = F.zerocopy_from_numpy(np.random.randint(25, size=d_i_d_u_nodes.shape, dtype=dtype))
......@@ -897,7 +893,7 @@ def test_sample_neighbors_exclude_edges_heteroG(dtype):
('drug', 'interacts', 'drug'): (d_i_d_u_nodes, d_i_d_v_nodes),
('drug', 'interacts', 'gene'): (d_i_g_u_nodes, d_i_g_v_nodes),
('drug', 'treats', 'disease'): (d_t_d_u_nodes, d_t_d_v_nodes)
})
}).to(F.ctx())
(U, V, EID) = (0, 1, 2)
......@@ -950,11 +946,10 @@ def test_sample_neighbors_exclude_edges_heteroG(dtype):
etype=('drug','treats','disease'))))
@pytest.mark.parametrize('dtype', ['int32', 'int64'])
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_exclude_edges_homoG(dtype):
u_nodes = F.zerocopy_from_numpy(np.unique(np.random.randint(300,size=100, dtype=dtype)))
v_nodes = F.zerocopy_from_numpy(np.random.randint(25, size=u_nodes.shape, dtype=dtype))
g = dgl.graph((u_nodes, v_nodes))
g = dgl.graph((u_nodes, v_nodes)).to(F.ctx())
(U, V, EID) = (0, 1, 2)
......
Subproject commit a3ee304a1f8e22f278df10600df2e4b333012592
Subproject commit cdaa9558a85e45d849016e5fe7b6e4ee79113f95
Subproject commit 0ef5c509856e12cc408f0f00ed586b4c5b1a155c
Subproject commit 6a3078c64cab0e2f276340fa5dcafa0d758ed890