Unverified Commit 46af76c3 authored by nv-dlasalle's avatar nv-dlasalle Committed by GitHub
Browse files

[Performance Improvement] Make GPU sampling and to_block use pinned memory to...

[Performance Improvement] Make GPU sampling and to_block use pinned memory to decrease required synchronization (#5685)
parent 585ce94b
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <curand_kernel.h> #include <curand_kernel.h>
#include <dgl/random.h> #include <dgl/random.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
#include <numeric> #include <numeric>
...@@ -15,9 +16,11 @@ ...@@ -15,9 +16,11 @@
#include "./dgl_cub.cuh" #include "./dgl_cub.cuh"
#include "./utils.h" #include "./utils.h"
using namespace dgl::cuda;
using namespace dgl::aten::cuda;
using TensorDispatcher = dgl::runtime::TensorDispatcher;
namespace dgl { namespace dgl {
using namespace cuda;
using namespace aten::cuda;
namespace aten { namespace aten {
namespace impl { namespace impl {
...@@ -287,13 +290,20 @@ COOMatrix _CSRRowWiseSamplingUniform( ...@@ -287,13 +290,20 @@ COOMatrix _CSRRowWiseSamplingUniform(
cudaEvent_t copyEvent; cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent)); CUDA_CALL(cudaEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and NDArray new_len_tensor;
// wait on a cudaevent if (TensorDispatcher::Global()->IsAvailable()) {
IdType new_len; new_len_tensor = NDArray::PinnedEmpty(
{1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
} else {
// use pageable memory, it will unnecessarily block but be functional
new_len_tensor = NDArray::Empty(
{1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
}
// copy using the internal current stream // copy using the internal current stream
device->CopyDataFromTo( CUDA_CALL(cudaMemcpyAsync(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
DGLContext{kDGLCPU, 0}, mat.indptr->dtype); cudaMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
...@@ -322,6 +332,7 @@ COOMatrix _CSRRowWiseSamplingUniform( ...@@ -322,6 +332,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
CUDA_CALL(cudaEventSynchronize(copyEvent)); CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent)); CUDA_CALL(cudaEventDestroy(copyEvent));
const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
picked_row = picked_row.CreateView({new_len}, picked_row->dtype); picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype); picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype); picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <dgl/immutable_graph.h> #include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
...@@ -36,6 +37,7 @@ ...@@ -36,6 +37,7 @@
using namespace dgl::aten; using namespace dgl::aten;
using namespace dgl::runtime::cuda; using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda; using namespace dgl::transform::cuda;
using TensorDispatcher = dgl::runtime::TensorDispatcher;
namespace dgl { namespace dgl {
namespace transform { namespace transform {
...@@ -165,6 +167,9 @@ struct CUDAIdsMapper { ...@@ -165,6 +167,9 @@ struct CUDAIdsMapper {
NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8)); NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
} }
} }
cudaEvent_t copyEvent;
NDArray new_len_tensor;
// Populate the mappings. // Populate the mappings.
if (generate_lhs_nodes) { if (generate_lhs_nodes) {
int64_t* count_lhs_device = static_cast<int64_t*>( int64_t* count_lhs_device = static_cast<int64_t*>(
...@@ -174,13 +179,23 @@ struct CUDAIdsMapper { ...@@ -174,13 +179,23 @@ struct CUDAIdsMapper {
src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
stream); stream);
device->CopyDataFromTo( CUDA_CALL(cudaEventCreate(&copyEvent));
count_lhs_device, 0, num_nodes_per_type.data(), 0, if (TensorDispatcher::Global()->IsAvailable()) {
sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx, new_len_tensor = NDArray::PinnedEmpty(
DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1}); {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
device->StreamSync(ctx, stream); DGLContext{kDGLCPU, 0});
} else {
// use pageable memory, it will unnecessarily block but be functional
new_len_tensor = NDArray::Empty(
{num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
DGLContext{kDGLCPU, 0});
}
CUDA_CALL(cudaMemcpyAsync(
new_len_tensor->data, count_lhs_device,
sizeof(*num_nodes_per_type.data()) * num_ntypes,
cudaMemcpyDeviceToHost, stream));
CUDA_CALL(cudaEventRecord(copyEvent, stream));
// Wait for the node counts to finish transferring.
device->FreeWorkspace(ctx, count_lhs_device); device->FreeWorkspace(ctx, count_lhs_device);
} else { } else {
maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream); maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
...@@ -189,14 +204,23 @@ struct CUDAIdsMapper { ...@@ -189,14 +204,23 @@ struct CUDAIdsMapper {
num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0]; num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
} }
} }
// Resize lhs nodes. // Map node numberings from global to local, and build pointer for CSR.
auto ret = MapEdges(graph, edge_arrays, node_maps, stream);
if (generate_lhs_nodes) { if (generate_lhs_nodes) {
// wait for the previous copy
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
// Resize lhs nodes.
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
num_nodes_per_type[ntype] =
static_cast<int64_t*>(new_len_tensor->data)[ntype];
lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype]; lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
} }
} }
// Map node numberings from global to local, and build pointer for CSR.
return MapEdges(graph, edge_arrays, node_maps, stream); return ret;
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment