Unverified Commit 3e2f94ed authored by Da Zheng, committed by GitHub

[Distributed] Refactor distributed training to new DGLGraph. (#1874)

* fix tests in partition.

* fix DistGraph.

* fix without shared memory.

* fix sampling.

* enable distributed test.

* fix tests.

* fix a bug in shared-mem heterograph.

* print better error messages.

* fix.

* don't specify formats.

* fix.

* fix

* small fix.
parent 05a43379
@@ -4,12 +4,12 @@ from collections.abc import MutableMapping
 import os
 import numpy as np
-from ..graph import DGLGraph
+from ..heterograph import DGLHeteroGraph
+from .. import heterograph_index
 from .. import backend as F
 from ..base import NID, EID
 from .kvstore import KVServer, KVClient
 from .standalone_kvstore import KVClient as SA_KVClient
-from ..graph_index import from_shared_mem_graph_index
 from .._ffi.ndarray import empty_shared_mem
 from ..frame import infer_scheme
 from .partition import load_partition
@@ -21,14 +21,9 @@ from .rpc_client import connect_to_server
 from .server_state import ServerState
 from .rpc_server import start_server
 from .dist_tensor import DistTensor, _get_data_name
-from ..transform import as_heterograph
-
-def _get_graph_path(graph_name):
-    return "/" + graph_name

 def _copy_graph_to_shared_mem(g, graph_name):
-    gidx = g._graph.copyto_shared_mem(_get_graph_path(graph_name))
-    new_g = DGLGraph(gidx)
+    new_g = g.shared_memory(graph_name, formats='csc')
     # We should share the node/edge data to the client explicitly instead of putting them
     # in the KVStore because some of the node/edge data may be duplicated.
     local_node_path = _get_ndata_path(graph_name, 'inner_node')
@@ -85,11 +80,10 @@ def _get_graph_from_shared_mem(graph_name):
     The client can access the graph structure and some metadata on nodes and edges directly
     through shared memory to reduce the overhead of data access.
     '''
-    gidx = from_shared_mem_graph_index(_get_graph_path(graph_name))
-    if gidx is None:
-        return gidx
-
-    g = DGLGraph(gidx)
+    g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory(graph_name)
+    if g is None:
+        return None
+    g = DGLHeteroGraph(g, ntypes, etypes)
     g.ndata['inner_node'] = _get_shared_mem_ndata(g, graph_name, 'inner_node')
     g.edata['inner_edge'] = _get_shared_mem_edata(g, graph_name, 'inner_edge')
     g.ndata[NID] = _get_shared_mem_ndata(g, graph_name, NID)
@@ -306,7 +300,7 @@ class DistGraph:
                 'The standalone mode can only work with the graph data with one partition'
             if self._gpb is None:
                 self._gpb = gpb
-            self._g = as_heterograph(g)
+            self._g = g
             for name in node_feats:
                 self._client.add_data(_get_data_name(name, NODE_PART_POLICY), node_feats[name])
             for name in edge_feats:
@@ -315,11 +309,7 @@ class DistGraph:
         else:
             connect_to_server(ip_config=ip_config)
             self._client = KVClient(ip_config)
-            g = _get_graph_from_shared_mem(graph_name)
-            if g is not None:
-                self._g = as_heterograph(g)
-            else:
-                self._g = None
+            self._g = _get_graph_from_shared_mem(graph_name)
             self._gpb = get_shared_mem_partition_book(graph_name, self._g)
             if self._gpb is None:
                 self._gpb = gpb
...
@@ -132,8 +132,7 @@ def merge_graphs(res_list, num_nodes):
         src_tensor = res_list[0].global_src
         dst_tensor = res_list[0].global_dst
         eid_tensor = res_list[0].global_eids
-    g = graph((src_tensor, dst_tensor),
-              restrict_format='coo', num_nodes=num_nodes)
+    g = graph((src_tensor, dst_tensor), num_nodes=num_nodes)
     g.edata[EID] = eid_tensor
     return g
...
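Note the dropped restrict_format='coo' argument: with the new DGLGraph the constructor no longer pins a single sparse format. A hedged sketch of the new call, with made-up tensors standing in for global_src, global_dst and global_eids:

    import torch
    import dgl
    from dgl.base import EID

    # Stand-in result tensors; in merge_graphs these come from the samplers.
    src_tensor = torch.tensor([0, 1, 2])
    dst_tensor = torch.tensor([1, 2, 0])
    eid_tensor = torch.tensor([7, 8, 9])

    # num_nodes fixes the node count; there is no restrict_format argument.
    g = dgl.graph((src_tensor, dst_tensor), num_nodes=4)
    g.edata[EID] = eid_tensor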
"""Server data""" """Server data"""
from .._ffi.function import _init_api from .._ffi.function import _init_api
from ..graph import DGLGraph
from ..transform import as_heterograph
# Remove C++ bindings for now, since not used # Remove C++ bindings for now, since not used
...@@ -63,9 +61,6 @@ class ServerState: ...@@ -63,9 +61,6 @@ class ServerState:
@graph.setter @graph.setter
def graph(self, graph): def graph(self, graph):
if isinstance(graph, DGLGraph):
self._graph = as_heterograph(graph)
else:
self._graph = graph self._graph = graph
......
@@ -4839,7 +4839,7 @@ class DGLHeteroGraph(object):
         ----------
         name : str
             The name of the shared memory.
-        formats : list of str (optional)
+        formats : str or a list of str (optional)
             Desired formats to be materialized.

         Returns
@@ -4849,8 +4849,10 @@ class DGLHeteroGraph(object):
         """
         assert len(name) > 0, "The name of shared memory cannot be empty"
         assert len(formats) > 0
+        if isinstance(formats, str):
+            formats = [formats]
         for fmt in formats:
-            assert fmt in ("coo", "csr", "csc")
+            assert fmt in ("coo", "csr", "csc"), '{} is not coo, csr or csc'.format(fmt)
         gidx = self._graph.shared_memory(name, self.ntypes, self.etypes, formats)
         return DGLHeteroGraph(gidx, self.ntypes, self.etypes)
...
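With the normalization above, shared_memory accepts either a single format string or a list of formats. Both calls below should be equivalent ways to share the structure after this change (a sketch; the graph and the names are made up):

    import dgl

    g = dgl.graph(([0, 1], [1, 2]))
    # A single string is now wrapped into a one-element list internally.
    g1 = g.shared_memory('demo_csc', formats='csc')
    # A list still works and can request several formats at once.
    g2 = g.shared_memory('demo_both', formats=['coo', 'csc'])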
@@ -322,6 +322,10 @@ HeteroGraphPtr HeteroGraph::CopyToSharedMem(
 std::tuple<HeteroGraphPtr, std::vector<std::string>, std::vector<std::string>>
 HeteroGraph::CreateFromSharedMem(const std::string &name) {
+  bool exist = SharedMemory::Exist(name);
+  if (!exist) {
+    return std::make_tuple(nullptr, std::vector<std::string>(), std::vector<std::string>());
+  }
   auto mem = std::make_shared<SharedMemory>(name);
   auto mem_buf = mem->Open(SHARED_MEM_METAINFO_SIZE_MAX);
   dmlc::MemoryFixedSizeStream strm(mem_buf, SHARED_MEM_METAINFO_SIZE_MAX);
...
@@ -120,8 +120,8 @@ bool SharedMemManager::CreateFromSharedMem<COOMatrix>(COOMatrix *coo,
 template <>
 bool SharedMemManager::CreateFromSharedMem<CSRMatrix>(CSRMatrix *csr,
                                                       std::string name) {
-  CreateFromSharedMem(&csr->indices, name + "_indices");
   CreateFromSharedMem(&csr->indptr, name + "_indptr");
+  CreateFromSharedMem(&csr->indices, name + "_indices");
   CreateFromSharedMem(&csr->data, name + "_data");
   strm_->Read(&csr->num_rows);
   strm_->Read(&csr->num_cols);
...
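The swap above presumably matters because the shared-memory metadata is consumed from a sequential stream, so the reader has to attach the CSR arrays in the same order the writer emitted them (indptr before indices). An illustrative Python sketch of that general rule, not DGL's actual C++ code:

    import io
    import struct

    # Writer: emit two fields in a fixed order.
    buf = io.BytesIO()
    buf.write(struct.pack('<q', 5))    # e.g. an indptr-related field
    buf.write(struct.pack('<q', 42))   # e.g. an indices-related field

    # Reader: must mirror the writer's order; swapping the two reads below
    # would silently assign each value to the wrong field.
    buf.seek(0)
    first = struct.unpack('<q', buf.read(8))[0]
    second = struct.unpack('<q', buf.read(8))[0]
    assert (first, second) == (5, 42)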
@@ -9,7 +9,7 @@ from scipy import sparse as spsp
 from numpy.testing import assert_array_equal
 from multiprocessing import Process, Manager, Condition, Value
 import multiprocessing as mp
-from dgl.graph_index import create_graph_index
+from dgl.heterograph_index import create_unitgraph_from_coo
 from dgl.data.utils import load_graphs, save_graphs
 from dgl.distributed import DistGraphServer, DistGraph
 from dgl.distributed import partition_graph, load_partition, load_partition_book, node_split, edge_split
@@ -50,9 +50,8 @@ def get_local_usable_addr():
     return ip_addr + ' ' + str(port)

 def create_random_graph(n):
-    arr = (spsp.random(n, n, density=0.001, format='coo') != 0).astype(np.int64)
-    ig = create_graph_index(arr, readonly=True)
-    return dgl.DGLGraph(ig)
+    arr = (spsp.random(n, n, density=0.001, format='coo', random_state=100) != 0).astype(np.int64)
+    return dgl.graph(arr)

 def run_server(graph_name, server_id, num_clients, shared_mem):
     g = DistGraphServer(server_id, "kv_ip_config.txt", num_clients,
@@ -65,7 +64,7 @@ def emb_init(shape, dtype):
     return F.zeros(shape, dtype, F.cpu())

 def rand_init(shape, dtype):
-    return F.tensor(np.random.normal(size=shape))
+    return F.tensor(np.random.normal(size=shape), F.float32)

 def run_client(graph_name, part_id, num_nodes, num_edges):
     time.sleep(5)
...
@@ -131,7 +131,6 @@ def test_rpc_sampling_shuffle():
 def check_standalone_sampling(tmpdir):
     g = CitationGraphDataset("cora")[0]
-    g.readonly()
     num_parts = 1
     num_hops = 1
     partition_graph(g, 'test_sampling', num_parts, tmpdir,
@@ -193,7 +192,6 @@ def check_rpc_in_subgraph(tmpdir, num_server):
     p.join()

     src, dst = sampled_graph.edges()
-    g = dgl.as_heterograph(g)
     assert sampled_graph.number_of_nodes() == g.number_of_nodes()
     subg1 = dgl.in_subgraph(g, nodes)
     src1, dst1 = subg1.edges()
...
@@ -39,11 +39,6 @@ def get_local_usable_addr():
     return ip_addr + ' ' + str(port)

-def create_random_graph(n):
-    arr = (spsp.random(n, n, density=0.001, format='coo') != 0).astype(np.int64)
-    ig = create_graph_index(arr, readonly=True)
-    return dgl.DGLGraph(ig)
-
 # Create an one-part Graph
 node_map = F.tensor([0,0,0,0,0,0], F.int64)
 edge_map = F.tensor([0,0,0,0,0,0,0], F.int64)
...
@@ -4,7 +4,7 @@ import os
 import numpy as np
 from scipy import sparse as spsp
 from numpy.testing import assert_array_equal
-from dgl.graph_index import create_graph_index
+from dgl.heterograph_index import create_unitgraph_from_coo
 from dgl.distributed import partition_graph, load_partition
 from dgl import function as fn
 import backend as F
@@ -14,13 +14,12 @@ import random
 def create_random_graph(n):
     arr = (spsp.random(n, n, density=0.001, format='coo', random_state=100) != 0).astype(np.int64)
-    ig = create_graph_index(arr, readonly=True)
-    return dgl.DGLGraph(ig)
+    return dgl.graph(arr)

 def check_partition(g, part_method, reshuffle):
     g.ndata['labels'] = F.arange(0, g.number_of_nodes())
-    g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10))
-    g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10))
+    g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10), F.float32)
+    g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10), F.float32)
     g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h'))
     g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh'))
     num_parts = 4
@@ -112,7 +111,6 @@ def test_partition():
 def test_hetero_partition():
     g = create_random_graph(10000)
-    g = dgl.as_heterograph(g)
     check_partition(g, 'metis', True)
     check_partition(g, 'metis', False)
     check_partition(g, 'random', True)
...
@@ -37,6 +37,6 @@ python3 -m pytest -v --junitxml=pytest_gindex.xml tests/graph_index || fail "graph_index"
 python3 -m pytest -v --junitxml=pytest_backend.xml tests/$DGLBACKEND || fail "backend-specific"
 export OMP_NUM_THREADS=1
-#if [ $2 != "gpu" ]; then
-# python3 -m pytest -v --junitxml=pytest_distributed.xml tests/distributed || fail "distributed"
-#fi
+if [ $2 != "gpu" ]; then
+python3 -m pytest -v --junitxml=pytest_distributed.xml tests/distributed || fail "distributed"
+fi