Unverified Commit 2cf4bd0a authored by Minjie Wang, committed by GitHub

Merge branch 'master' into dist_part

parents 2e8ae9f9 d077d371
"""DGL distributed module contains classes and functions to support
distributed graph neural network training and inference in a cluster of
machines.
This includes a few submodules:
* distributed data structures including distributed graph, distributed tensor
and distributed embeddings.
* distributed sampling.
* distributed workload split at runtime.
* graph partition.
"""
import os
import sys
"""DGL distributed module"""
from .dist_graph import DistGraphServer, DistGraph, node_split, edge_split
from .dist_tensor import DistTensor
from .partition import partition_graph, load_partition, load_partition_feats, load_partition_book
@@ -28,4 +13,4 @@ from .dist_context import initialize, exit_client
from .kvstore import KVServer, KVClient
from .server_state import ServerState
from .dist_dataloader import DistDataLoader
from .graph_services import sample_neighbors, sample_etype_neighbors, in_subgraph
from .graph_services import *
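A minimal, hypothetical trainer-side sketch of how the building blocks imported above are typically combined (the ip config file, graph name, batch size and fanout are placeholders, and exact signatures may differ between DGL versions):

import dgl
import torch

dgl.distributed.initialize('ip_config.txt')                      # connect to the running graph servers
g = dgl.distributed.DistGraph('my_graph')                        # attach to the partitioned graph
train_nids = dgl.distributed.node_split(g.ndata['train_mask'])   # this trainer's share of the training nodes
for seeds in torch.split(train_nids, 1024):
    frontier = dgl.distributed.sample_neighbors(g, seeds, 10)    # distributed neighbor sampling
    # ... build message-flow graphs and run the forward/backward pass ...
dgl.distributed.exit_client()                                    # tear down the client connection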
@@ -5,3 +5,6 @@ MAX_QUEUE_SIZE = 20*1024*1024*1024
SERVER_EXIT = "server_exit"
SERVER_KEEP_ALIVE = "server_keep_alive"
DEFAULT_NTYPE = '_N'
DEFAULT_ETYPE = (DEFAULT_NTYPE, '_E', DEFAULT_NTYPE)
@@ -12,7 +12,10 @@ from ..base import NID, EID
from ..utils import toindex
from .. import backend as F
__all__ = ['sample_neighbors', 'in_subgraph', 'find_edges']
__all__ = [
'sample_neighbors', 'sample_etype_neighbors',
'in_subgraph', 'find_edges'
]
SAMPLING_SERVICE_ID = 6657
INSUBGRAPH_SERVICE_ID = 6658
......
@@ -173,7 +173,7 @@ def load_partition_feats(part_config, part_id):
return node_feats, edge_feats
def load_partition_book(part_config, part_id, graph=None):
''' Load a graph partition book from the partition config file.
'''Load a graph partition book from the partition config file.
Parameters
----------
......
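A brief, hypothetical usage sketch of the two loaders shown above (the config path is illustrative, and the return tuple of load_partition_book is assumed from its docstring and may differ by DGL version):

part_config = '/tmp/partition/my_graph.json'   # written by partition_graph(); path is illustrative
node_feats, edge_feats = load_partition_feats(part_config, part_id=0)
gpb, graph_name, ntypes, etypes = load_partition_book(part_config, part_id=0)  # assumed return values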
@@ -4,6 +4,8 @@ from __future__ import absolute_import, division
from collections.abc import Mapping, Iterable, Sequence
from collections import defaultdict
from functools import wraps
import glob
import os
import numpy as np
from ..base import DGLError, dgl_warning, NID, EID
@@ -914,6 +916,46 @@ def set_num_threads(num_threads):
"""
_CAPI_DGLSetOMPThreads(num_threads)
def get_num_threads():
"""Get the number of OMP threads in the process"""
return _CAPI_DGLGetOMPThreads()
def get_numa_nodes_cores():
""" Returns numa nodes info, format:
{<node_id>: [(<core_id>, [<sibling_thread_id_0>, <sibling_thread_id_1>, ...]), ...], ...}
E.g.: {0: [(0, [0, 4]), (1, [1, 5])], 1: [(2, [2, 6]), (3, [3, 7])]}
If not available, returns {}
"""
numa_node_paths = glob.glob('/sys/devices/system/node/node[0-9]*')
if not numa_node_paths:
return {}
nodes = {}
try:
for node_path in numa_node_paths:
numa_node_id = int(os.path.basename(node_path)[4:])
thread_siblings = {}
for cpu_dir in glob.glob(os.path.join(node_path, 'cpu[0-9]*')):
cpu_id = int(os.path.basename(cpu_dir)[3:])
with open(os.path.join(cpu_dir, 'topology', 'core_id')) as core_id_file:
core_id = int(core_id_file.read().strip())
if core_id in thread_siblings:
thread_siblings[core_id].append(cpu_id)
else:
thread_siblings[core_id] = [cpu_id]
nodes[numa_node_id] = sorted([(k, sorted(v)) for k, v in thread_siblings.items()])
except (OSError, ValueError, IndexError, IOError):
dgl_warning('Failed to read NUMA info')
return {}
return nodes
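A short, hypothetical illustration of consuming the returned mapping, e.g. picking one logical CPU per physical core on NUMA node 0 (the fallback and variable names below are not part of the diff):

numa_info = get_numa_nodes_cores()
if numa_info:
    # take the first hyperthread sibling of each physical core on node 0
    node0_cores = [siblings[0] for _core_id, siblings in numa_info[0]]
else:
    node0_cores = list(range(os.cpu_count()))   # no NUMA info available; fall back to all logical CPUs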
def alias_func(func):
"""Return an alias function with proper docstring."""
@wraps(func)
......
@@ -548,11 +548,11 @@ __global__ void SpMMCsrKernel(
const int64_t* __restrict__ ebcast_off,
int64_t ufeat_len, int64_t efeat_len, int64_t out_len) {
// SPMM with CSR.
int ty = blockIdx.y * blockDim.y + threadIdx.y;
const Idx stride_y = blockDim.y * gridDim.y;
const int stride_x = blockDim.x * gridDim.x;
int ty = blockIdx.x * blockDim.y + threadIdx.y;
const Idx stride_y = blockDim.y * gridDim.x;
const int stride_x = blockDim.x * gridDim.y;
while (ty < num_rows) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int tx = blockIdx.y * blockDim.x + threadIdx.x;
while (tx < out_len) {
DType local_accum = ReduceOp::zero();
Idx local_argu = 0, local_arge = 0;
@@ -759,8 +759,8 @@ void SpMMCsr(
rhs_len = bcast.rhs_len;
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
const int nbx = (len + ntx - 1) / ntx;
const int nby = FindNumBlocks<'y'>((csr.num_rows + nty - 1) / nty);
const int nby = (len + ntx - 1) / ntx;
const int nbx = FindNumBlocks<'x'>((csr.num_rows + nty - 1) / nty);
//LOG(INFO) << "nblks=(" << nbx << ", " << nby << ") nthrs=(" << ntx << ", " << nty << ")";
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
......
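One plausible reading of the x/y swap above: CUDA allows up to 2^31 - 1 blocks along gridDim.x but only 65535 along gridDim.y (see the CUDA_MAX_NUM_BLOCKS_* defines in the next hunk), so mapping the CSR row dimension to blockIdx.x lets far more row blocks be launched; the grid-stride loop still covers any remainder either way, so the change is about exposing parallelism rather than correctness. A small illustrative check in Python (the numbers are made up):

# Illustrative arithmetic only; mirrors the row-block computation above.
CUDA_MAX_NUM_BLOCKS_X = 0x7FFFFFFF   # 2**31 - 1 blocks along x
CUDA_MAX_NUM_BLOCKS_Y = 0xFFFF       # only 65535 blocks along y
num_rows, nty = 100_000_000, 8       # hypothetical row count, threads per block along the row dim
row_blocks = (num_rows + nty - 1) // nty
assert row_blocks > CUDA_MAX_NUM_BLOCKS_Y    # would be clamped on a y-dimension grid
assert row_blocks <= CUDA_MAX_NUM_BLOCKS_X   # fits comfortably along x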
@@ -18,7 +18,8 @@ namespace cuda {
#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
#define CUDA_MAX_NUM_THREADS 1024
// The max number of threads per block
#define CUDA_MAX_NUM_THREADS 256
#ifdef USE_FP16
#define SWITCH_BITS(bits, DType, ...) \
......
@@ -197,10 +197,15 @@ class CUDADeviceAPI final : public DeviceAPI {
* not just the one that performed the allocation
*/
void PinData(void* ptr, size_t nbytes) {
// prevent users from pinning empty tensors or graphs
if (ptr == nullptr || nbytes == 0)
return;
CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
}
void UnpinData(void* ptr) {
if (ptr == nullptr)
return;
CUDA_CALL(cudaHostUnregister(ptr));
}
......
@@ -26,6 +26,10 @@ DGL_REGISTER_GLOBAL("utils.internal._CAPI_DGLSetOMPThreads")
omp_set_num_threads(num_threads);
});
DGL_REGISTER_GLOBAL("utils.internal._CAPI_DGLGetOMPThreads")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
*rv = omp_get_max_threads();
});
DGL_REGISTER_GLOBAL("utils.checks._CAPI_DGLCOOIsSorted")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
......
@@ -1008,52 +1008,70 @@ def test_pin_memory_(idtype):
g = g.to(F.cpu())
assert not g.is_pinned()
if F.is_cuda_available():
# unpin an unpinned CPU graph, directly return
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# unpin an unpinned CPU graph, directly return
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# pin a CPU graph
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
assert F.context(g.nodes['user'].data['h']) == F.cpu()
assert F.context(g.nodes['game'].data['i']) == F.cpu()
assert F.context(g.edges['plays'].data['e']) == F.cpu()
for ntype in g.ntypes:
assert F.context(g.batch_num_nodes(ntype)) == F.cpu()
for etype in g.canonical_etypes:
assert F.context(g.batch_num_edges(etype)) == F.cpu()
# pin a CPU graph
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
assert g.nodes['user'].data['h'].is_pinned()
assert g.nodes['game'].data['i'].is_pinned()
assert g.edges['plays'].data['e'].is_pinned()
assert F.context(g.nodes['user'].data['h']) == F.cpu()
assert F.context(g.nodes['game'].data['i']) == F.cpu()
assert F.context(g.edges['plays'].data['e']) == F.cpu()
for ntype in g.ntypes:
assert F.context(g.batch_num_nodes(ntype)) == F.cpu()
for etype in g.canonical_etypes:
assert F.context(g.batch_num_edges(etype)) == F.cpu()
# it's fine to clone with new formats, but new graphs are not pinned
# >>> g.formats()
# {'created': ['coo'], 'not created': ['csr', 'csc']}
assert not g.formats('csc').is_pinned()
assert not g.formats('csr').is_pinned()
# 'coo' formats is already created and thus not cloned
assert g.formats('coo').is_pinned()
# pin a pinned graph, directly return
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
# it's fine to clone with new formats, but new graphs are not pinned
# >>> g.formats()
# {'created': ['coo'], 'not created': ['csr', 'csc']}
assert not g.formats('csc').is_pinned()
assert not g.formats('csr').is_pinned()
# 'coo' formats is already created and thus not cloned
assert g.formats('coo').is_pinned()
# pin a pinned graph, directly return
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
# unpin a pinned graph
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# unpin a pinned graph
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
g1 = g.to(F.cuda())
g1 = g.to(F.cuda())
# unpin an unpinned GPU graph, directly return
g1.unpin_memory_()
assert not g1.is_pinned()
assert g1.device == F.cuda()
# unpin an unpinned GPU graph, directly return
g1.unpin_memory_()
assert not g1.is_pinned()
assert g1.device == F.cuda()
# error pinning a GPU graph
with pytest.raises(DGLError):
g1.pin_memory_()
# error pinning a GPU graph
with pytest.raises(DGLError):
g1.pin_memory_()
# test pin empty homograph
g2 = dgl.graph(([], []))
g2.pin_memory_()
assert g2.is_pinned()
g2.unpin_memory_()
assert not g2.is_pinned()
# test pin heterograph with 0 edge of one relation type
g3 = dgl.heterograph({
('a','b','c'): ([0, 1], [1, 2]),
('c','d','c'): ([], [])}).astype(idtype)
g3.pin_memory_()
assert g3.is_pinned()
g3.unpin_memory_()
assert not g3.is_pinned()
@parametrize_idtype
def test_convert_bound(idtype):
......
@@ -25,7 +25,7 @@ def test_pin_unpin():
F.to_dgl_nd(t_pin).unpin_memory_()
else:
with pytest.raises(dgl.DGLError):
# tensorflow and mxnet should throw an erro
# tensorflow and mxnet should throw an error
dgl.utils.pin_memory_inplace(t)
if __name__ == "__main__":
......
@@ -3,14 +3,12 @@ import sys
import os
import numpy as np
from scipy import sparse as spsp
from numpy.testing import assert_array_equal
from dgl.heterograph_index import create_unitgraph_from_coo
from dgl.distributed import partition_graph, load_partition, load_partition_feats
from dgl.distributed.graph_partition_book import BasicPartitionBook, RangePartitionBook, \
NodePartitionPolicy, EdgePartitionPolicy, HeteroDataName
from dgl import function as fn
import backend as F
import unittest
import pickle
import random
import tempfile
def _get_inner_node_mask(graph, ntype_id):
@@ -426,8 +424,100 @@ def test_hetero_partition():
check_hetero_partition(hg, 'random')
check_hetero_partition(hg, 'metis', 4, 8, load_feats=False)
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_BasicPartitionBook():
part_id = 0
num_parts = 2
node_map = np.random.choice(num_parts, 1000)
edge_map = np.random.choice(num_parts, 5000)
graph = dgl.rand_graph(1000, 5000)
graph = dgl.node_subgraph(graph, F.arange(0, graph.num_nodes()))
gpb = BasicPartitionBook(part_id, num_parts, node_map, edge_map, graph)
c_etype = ('_N', '_E', '_N')
assert gpb.etypes == ['_E']
assert gpb.canonical_etypes == [c_etype]
node_policy = NodePartitionPolicy(gpb, '_N')
assert node_policy.type_name == '_N'
edge_policy = EdgePartitionPolicy(gpb, '_E')
assert edge_policy.type_name == '_E'
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_RangePartitionBook():
part_id = 0
num_parts = 2
# homogeneous
node_map = {'_N': F.tensor([[0, 1000], [1000, 2000]])}
edge_map = {'_E': F.tensor([[0, 5000], [5000, 10000]])}
ntypes = {'_N': 0}
etypes = {'_E': 0}
gpb = RangePartitionBook(
part_id, num_parts, node_map, edge_map, ntypes, etypes)
assert gpb.etypes == ['_E']
assert gpb.canonical_etypes == [None]
assert gpb._to_canonical_etype('_E') == '_E'
node_policy = NodePartitionPolicy(gpb, '_N')
assert node_policy.type_name == '_N'
edge_policy = EdgePartitionPolicy(gpb, '_E')
assert edge_policy.type_name == '_E'
# heterogeneous, init via etype
node_map = {'node1': F.tensor([[0, 1000], [1000, 2000]]), 'node2': F.tensor([
[0, 1000], [1000, 2000]])}
edge_map = {'edge1': F.tensor([[0, 5000], [5000, 10000]])}
ntypes = {'node1': 0, 'node2': 1}
etypes = {'edge1': 0}
gpb = RangePartitionBook(
part_id, num_parts, node_map, edge_map, ntypes, etypes)
assert gpb.etypes == ['edge1']
assert gpb.canonical_etypes == [None]
assert gpb._to_canonical_etype('edge1') == 'edge1'
node_policy = NodePartitionPolicy(gpb, 'node1')
assert node_policy.type_name == 'node1'
edge_policy = EdgePartitionPolicy(gpb, 'edge1')
assert edge_policy.type_name == 'edge1'
# heterogeneous, init via canonical etype
node_map = {'node1': F.tensor([[0, 1000], [1000, 2000]]), 'node2': F.tensor([
[0, 1000], [1000, 2000]])}
edge_map = {('node1', 'edge1', 'node2'): F.tensor([[0, 5000], [5000, 10000]])}
ntypes = {'node1': 0, 'node2': 1}
etypes = {('node1', 'edge1', 'node2'): 0}
c_etype = list(etypes.keys())[0]
gpb = RangePartitionBook(
part_id, num_parts, node_map, edge_map, ntypes, etypes)
assert gpb.etypes == ['edge1']
assert gpb.canonical_etypes == [c_etype]
assert gpb._to_canonical_etype('edge1') == c_etype
assert gpb._to_canonical_etype(c_etype) == c_etype
expect_except = False
try:
gpb._to_canonical_etype(('node1', 'edge2', 'node2'))
except:
expect_except = True
assert expect_except
expect_except = False
try:
gpb._to_canonical_etype('edge2')
except:
expect_except = True
assert expect_except
node_policy = NodePartitionPolicy(gpb, 'node1')
assert node_policy.type_name == 'node1'
edge_policy = EdgePartitionPolicy(gpb, c_etype)
assert edge_policy.type_name == c_etype
data_name = HeteroDataName(False, 'edge1', 'edge1')
assert data_name.get_type() == 'edge1'
data_name = HeteroDataName(False, c_etype, 'edge1')
assert data_name.get_type() == c_etype
if __name__ == '__main__':
os.makedirs('/tmp/partition', exist_ok=True)
test_partition()
test_hetero_partition()
test_BasicPartitionBook()
test_RangePartitionBook()
@@ -57,6 +57,17 @@ def test_pin_unpin_column():
assert col._data_nd is None
assert not g.ndata['x'].is_pinned()
@pytest.mark.skipif(F._default_context_str == 'cpu', reason='Need gpu for this test.')
def test_pin_empty():
t = torch.tensor([])
assert not t.is_pinned()
# Empty tensors will not be pinned or unpinned. It's a no-op.
# This is also the default behavior in PyTorch.
# We just check that it won't raise an error.
nd = dgl.utils.pin_memory_inplace(t)
assert not t.is_pinned()
if __name__ == "__main__":
test_pin_noncontiguous()
test_pin_view()
......
@@ -22,25 +22,39 @@ OpenMP settings
During training on CPU, the training and dataloading parts need to run simultaneously.
The best OpenMP parallelization performance is achieved by setting an optimal number of working threads and dataloading workers.
Nodes with a high number of CPU cores may benefit from a higher number of dataloading workers.
A good starting point is num_workers=4 in the DataLoader constructor for nodes with 32 cores or more.
If the number of cores is rather small, the best performance might be achieved with just one
dataloader worker, or even with num_workers=0 so that dataloading and training run
in the same process.
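For example, a minimal starting configuration on a node with 32 or more cores might look as follows (the graph ``g``, node ids and sampler below are placeholders; parameter names follow the PyTorch-backed DGL DataLoader):

.. code:: python

    sampler = dgl.dataloading.NeighborSampler([10, 10])
    dataloader = dgl.dataloading.DataLoader(
        g, train_nids, sampler,
        batch_size=1024, shuffle=True,
        num_workers=4)   # four dataloading worker processes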
**GNU OpenMP**
Default best-known method (BKM) for setting the number of OMP threads with the PyTorch backend:
**Dataloader CPU affinity**
``OMP_NUM_THREADS`` = number of physical cores – ``num_workers``
If the number of dataloader workers is greater than 0, please consider using the **use_cpu_affinity()** method
of the DGL DataLoader class; it will generally result in a significant performance improvement for training.
The number of physical cores can be checked with ``lscpu`` ("Core(s) per socket")
or the ``nproc`` command on the Linux command line.
Below is a simple bash script example for setting the OMP threads and ``pytorch`` backend dataloader workers:
*use_cpu_affinity* will set the proper OpenMP thread count (equal to the number of CPU cores allocated for the main process),
affinitize dataloader workers to separate CPU cores, and restrict the main process to the remaining cores.
.. code:: bash
In setups with multiple NUMA nodes, *use_cpu_affinity* will only use cores of NUMA node 0 by default,
under the assumption that the workload scales poorly across multiple NUMA nodes. If you believe
your workload will perform better utilizing more than one NUMA node, you can pass
the list of cores to use for dataloading (loader_cores) and for compute (compute_cores).
    cores=`nproc`
    num_workers=4
    export OMP_NUM_THREADS=$(($cores-$num_workers))
    python script.py --gpu -1 --num_workers=$num_workers
The loader_cores and compute_cores arguments (lists of CPU cores) can be passed to *enable_cpu_affinity* for more
control over which cores should be used, e.g. when a workload scales well across multiple NUMA nodes.
Depending on the dataset, model, and CPU, the optimal number of dataloader workers and OpenMP threads may vary,
but should stay close to the general defaults suggested above [#f4]_ .
Usage:
.. code:: python

    dataloader = dgl.dataloading.DataLoader(...)
    ...
    with dataloader.enable_cpu_affinity():
        <training loop or inferencing>
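When a workload does scale well across NUMA nodes, the loader_cores and compute_cores arguments described above can be supplied explicitly (the core ids below are purely illustrative):

.. code:: python

    dataloader = dgl.dataloading.DataLoader(..., num_workers=4)
    ...
    with dataloader.enable_cpu_affinity(loader_cores=[0, 1, 2, 3],
                                        compute_cores=list(range(4, 16))):
        <training loop or inferencing>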
**Manual control**
For advanced and more fine-grained control over OpenMP settings, please refer to the Maximize Performance of Intel® Optimization for PyTorch* on CPU article [#f4]_ .
.. rubric:: Footnotes
......