Unverified Commit 25ac3344 authored by Da Zheng, committed by GitHub

[Distributed] Heterogeneous graph support (#2457)



* Distributed heterograph (#3)

* heterogeneous graph partition.

* fix graph partition book for heterograph.

* load heterograph partitions.

* update DistGraphServer to support heterograph.

* make DistGraph runnable for heterograph.

* partition a graph and store parts with homogeneous graph structure.

* update DistGraph server&client to use homogeneous graph.

* shuffle node Ids based on node types.

* load mag in heterograph.

* fix per-node-type mapping.

* balance node types.

* fix for homogeneous graph

* store etype for now.

* fix data name.

* fix a bug in example.

* add profiler in rgcn.

* heterogeneous RGCN.

* map homogeneous node ids to hetero node ids.

* fix graph partition book.

* fix DistGraph.

* shuffle eids.

* verify eids and their mappings when loading a partition.

* Id map from homogeneous Ids to per-type Ids (see the sketch after this commit message).

* verify partitioned results.

* add test for distributed sampler.

* add mapping from per-type Ids to homogeneous Ids.

* update example.

* fix DistGraph.

* Revert "add profiler in rgcn."

This reverts commit 36daaed8b660933dac8f61a39faec3da2467d676.

* add tests for homogeneous graphs.

* fix a bug.

* fix test.

* fix for one partition.

* fix for standalone training and evaluation.

* small fix.

* fix two bugs.

* initialize projection matrix.

* small fix on RGCN.

* Fix rgcn performance (#17)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-171.ec2.internal>

* fix lint.

* fix lint.

* fix lint.

* fix lint.

* fix lint.

* fix lint.

* fix.

* fix test.

* fix lint.

* test partitions.

* remove redundant test for partitioning.

* remove commented code.

* fix partition.

* fix tests.

* fix RGCN.

* fix test.

* fix test.

* fix test.

* fix.

* fix a bug.

* update dmlc-core.

* fix.

* fix rgcn.

* update readme.

* add comments.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-2-202.us-west-1.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-9-132.us-west-1.compute.internal>
Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-171.ec2.internal>

* fix.

* fix.

* add div_int.

* fix.

* fix.

* fix lint.

* fix.

* fix.

* fix.

* adjust.

* move code.

* handle heterograph.

* return pytorch tensor in GPB.

* remove some tests in example.

* add to_block for distributed training.

* use distributed to_block.

* remove unnecessary function in DistGraph.

* remove distributed to_block.

* use pytorch tensor.

* fix a bug in ntypes and etypes.

* enable norm.

* make the data loader compatible with the old format.

* fix.

* add comments.

* fix a bug.

* add test for heterograph.

* support partition without reshuffle.

* add test.

* support partition without reshuffle.

* fix.

* add test.

* fix bugs.

* fix lint.

* fix dataset.

* fix for mxnet.

* update docstring.

* rename to floor_div

* avoid exposing NodePartitionPolicy and EdgePartitionPolicy.

* fix docstring.

* fix error.

* fixes.

* fix comments.

* rename.

* rename.

* explain IdMap.

* fix docstring.

* fix docstring.

* update docstring.

* remove the code of returning heterograph.

* remove argument.

* fix example.

* make GraphPartitionBook an abstract class.

* fix.

* fix.

* fix a bug.

* fix a bug in example

* fix a bug

* reverse heterograph sampling.

* temp fix.

* fix lint.

* Revert "temp fix."

This reverts commit c450717b9f578b8c48769c675f2a19d6c1e64381.

* compute norm.

* Revert "reverse heterograph sampling."

This reverts commit bd6deb7f52998de76508f800441ff518e2fadcb9.

* fix.

* move id_map.py

* remove check

* add more comments.

* update docstring.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-2-202.us-west-1.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-9-132.us-west-1.compute.internal>
Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-171.ec2.internal>
parent aa884d43
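
Several of the bullets above (shuffling node Ids by node type, the Id map from homogeneous Ids to per-type Ids and back) rest on one idea: after reshuffling, each node type owns contiguous ranges of homogeneous Ids, so the two Id spaces can be converted with offset arithmetic. The following is a minimal NumPy sketch of that idea; it assumes a single contiguous range per type (the real IdMap tracks one range per type per partition), and the names and layout are illustrative rather than DGL's API.

```python
import numpy as np

# Hypothetical layout: type 0 -> homogeneous IDs [0, 100),
# type 1 -> [100, 250), type 2 -> [250, 300).
type_offsets = np.array([0, 100, 250, 300])

def to_per_type(homo_ids):
    """Map homogeneous IDs to (type ID, per-type ID) pairs."""
    homo_ids = np.asarray(homo_ids)
    type_ids = np.searchsorted(type_offsets, homo_ids, side='right') - 1
    return type_ids, homo_ids - type_offsets[type_ids]

def to_homo(type_ids, per_type_ids):
    """Map (type ID, per-type ID) pairs back to homogeneous IDs."""
    return type_offsets[np.asarray(type_ids)] + np.asarray(per_type_ids)

type_ids, local_ids = to_per_type([5, 120, 260])   # types [0, 1, 2], per-type IDs [5, 20, 10]
assert np.array_equal(to_homo(type_ids, local_ids), [5, 120, 260])
```
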
......@@ -236,7 +236,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLPartitionWithHalo_Hetero")
*rv = ret_list;
});
// TODO(JJ): What's this?
template<class IdType>
struct EdgeProperty {
IdType eid;
int64_t idx;
int part_id;
};
// Reassign edge IDs so that all edges in a partition have contiguous edge IDs.
// The original edge IDs are returned.
DGL_REGISTER_GLOBAL("partition._CAPI_DGLReassignEdges_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
......@@ -245,26 +253,54 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLReassignEdges_Hetero")
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Reorder only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
bool is_incsr = args[1];
IdArray etype = args[1];
IdArray part_id = args[2];
bool is_incsr = args[3];
auto csrmat = is_incsr ? ugptr->GetCSCMatrix(0) : ugptr->GetCSRMatrix(0);
int64_t num_edges = csrmat.data->shape[0];
int64_t num_rows = csrmat.indptr->shape[0] - 1;
IdArray new_data =
IdArray::Empty({num_edges}, csrmat.data->dtype, csrmat.data->ctx);
// Return the original edge Ids.
*rv = new_data;
// TODO(zhengda) I need to invalidate out-CSR and COO.
// Generate new edge Ids.
// TODO(zhengda) after assignment, we actually don't need to store them
// physically.
ATEN_ID_TYPE_SWITCH(new_data->dtype, IdType, {
IdType *typed_new_data = static_cast<IdType *>(new_data->data);
CHECK(etype->dtype.bits == sizeof(IdType) * 8);
CHECK(part_id->dtype.bits == sizeof(IdType) * 8);
const IdType *part_id_data = static_cast<IdType *>(part_id->data);
const IdType *etype_data = static_cast<IdType *>(etype->data);
const IdType *indptr_data = static_cast<IdType *>(csrmat.indptr->data);
IdType *typed_data = static_cast<IdType *>(csrmat.data->data);
for (int64_t i = 0; i < num_edges; i++) {
typed_new_data[i] = typed_data[i];
typed_data[i] = i;
IdType *typed_new_data = static_cast<IdType *>(new_data->data);
std::vector<EdgeProperty<IdType>> indexed_eids(num_edges);
for (int64_t i = 0; i < num_rows; i++) {
for (int64_t j = indptr_data[i]; j < indptr_data[i + 1]; j++) {
indexed_eids[j].eid = typed_data[j];
indexed_eids[j].idx = j;
indexed_eids[j].part_id = part_id_data[i];
}
}
auto comp = [etype_data](const EdgeProperty<IdType> &a, const EdgeProperty<IdType> &b) {
if (a.part_id == b.part_id) {
return etype_data[a.eid] < etype_data[b.eid];
} else {
return a.part_id < b.part_id;
}
};
// We only need to sort the edges if the input graph has multiple relations.
// If it's a homogeneous graph, we'll just assign edge Ids based on its previous order.
if (etype->shape[0] > 0) {
std::sort(indexed_eids.begin(), indexed_eids.end(), comp);
}
for (int64_t new_eid = 0; new_eid < num_edges; new_eid++) {
int64_t orig_idx = indexed_eids[new_eid].idx;
typed_new_data[new_eid] = typed_data[orig_idx];
typed_data[orig_idx] = new_eid;
}
});
ugptr->InvalidateCSR();
ugptr->InvalidateCOO();
});
DGL_REGISTER_GLOBAL("partition._CAPI_GetHaloSubgraphInnerNodes_Hetero")
......
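
The `_CAPI_DGLReassignEdges_Hetero` change above records, for every CSR position, the original edge Id, its position, and the partition of its destination row, orders the edges by (partition, edge type), and then hands out contiguous new edge Ids while returning the original Id of each new Id. Below is a small NumPy sketch of the same bookkeeping, assuming the inputs are already flattened into per-edge arrays; the helper name and signature are illustrative, not the C++ kernel's.

```python
import numpy as np

def reassign_edge_ids(old_eids, part_ids, etypes):
    """Order edges by (partition, edge type) and assign contiguous new IDs.

    old_eids: original edge ID stored at each CSR position.
    part_ids: partition ID of each edge (taken from its destination row).
    etypes:   edge type of each original edge ID.
    Returns (orig_of_new, new_of_old).
    """
    old_eids, part_ids, etypes = map(np.asarray, (old_eids, part_ids, etypes))
    # np.lexsort treats its last key as the primary sort key.
    order = np.lexsort((etypes[old_eids], part_ids))
    orig_of_new = old_eids[order]                    # new ID -> original edge ID
    new_of_old = np.empty_like(old_eids)
    new_of_old[order] = np.arange(len(old_eids))     # CSR position -> new edge ID
    return orig_of_new, new_of_old
```

As the comment in the code notes, the sort is only needed when the graph has multiple relations; for a homogeneous graph the new Ids simply follow the previous order.
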
......@@ -1238,6 +1238,18 @@ HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DLContext& ctx) {
}
}
void UnitGraph::InvalidateCSR() {
this->out_csr_ = CSRPtr(new CSR());
}
void UnitGraph::InvalidateCSC() {
this->in_csr_ = CSRPtr(new CSR());
}
void UnitGraph::InvalidateCOO() {
this->coo_ = COOPtr(new COO());
}
UnitGraph::UnitGraph(GraphPtr metagraph, CSRPtr in_csr, CSRPtr out_csr, COOPtr coo,
dgl_format_code_t formats)
: BaseHeteroGraph(metagraph), in_csr_(in_csr), out_csr_(out_csr), coo_(coo) {
......
......@@ -276,6 +276,12 @@ class UnitGraph : public BaseHeteroGraph {
*/
std::tuple<UnitGraphPtr, IdArray, IdArray>ToSimple() const;
void InvalidateCSR();
void InvalidateCSC();
void InvalidateCOO();
private:
friend class Serializer;
friend class HeteroGraph;
......
......@@ -56,7 +56,7 @@ SharedMemory::~SharedMemory() {
close(fd_);
if (own_) {
LOG(INFO) << "remove " << name << " for shared memory";
CHECK(shm_unlink(name.c_str()) != -1) << strerror(errno);
shm_unlink(name.c_str());
// The resource has been deleted. We don't need to keep track of it any more.
DeleteResource(name);
}
......
......@@ -69,7 +69,7 @@ def rand_init(shape, dtype):
def run_client(graph_name, part_id, server_count, num_clients, num_nodes, num_edges):
time.sleep(5)
dgl.distributed.initialize("kv_ip_config.txt", server_count)
gpb, graph_name = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
part_id, None)
g = DistGraph(graph_name, gpb=gpb)
check_dist_graph(g, num_clients, num_nodes, num_edges)
......@@ -237,10 +237,149 @@ def check_server_client(shared_mem, num_servers, num_clients):
print('clients have terminated')
def run_client_hetero(graph_name, part_id, server_count, num_clients, num_nodes, num_edges):
time.sleep(5)
dgl.distributed.initialize("kv_ip_config.txt", server_count)
gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
part_id, None)
g = DistGraph(graph_name, gpb=gpb)
check_dist_graph_hetero(g, num_clients, num_nodes, num_edges)
def create_random_hetero():
num_nodes = {'n1': 10000, 'n2': 10010, 'n3': 10020}
etypes = [('n1', 'r1', 'n2'),
('n1', 'r2', 'n3'),
('n2', 'r3', 'n3')]
edges = {}
for etype in etypes:
src_ntype, _, dst_ntype = etype
arr = spsp.random(num_nodes[src_ntype], num_nodes[dst_ntype], density=0.001, format='coo',
random_state=100)
edges[etype] = (arr.row, arr.col)
g = dgl.heterograph(edges, num_nodes)
g.nodes['n1'].data['feat'] = F.unsqueeze(F.arange(0, g.number_of_nodes('n1')), 1)
g.edges['r1'].data['feat'] = F.unsqueeze(F.arange(0, g.number_of_edges('r1')), 1)
return g
def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges):
# Test API
for ntype in num_nodes:
assert ntype in g.ntypes
assert num_nodes[ntype] == g.number_of_nodes(ntype)
for etype in num_edges:
assert etype in g.etypes
assert num_edges[etype] == g.number_of_edges(etype)
assert g.number_of_nodes() == sum([num_nodes[ntype] for ntype in num_nodes])
assert g.number_of_edges() == sum([num_edges[etype] for etype in num_edges])
# Test reading node data
nids = F.arange(0, int(g.number_of_nodes('n1') / 2))
feats1 = g.nodes['n1'].data['feat'][nids]
feats = F.squeeze(feats1, 1)
assert np.all(F.asnumpy(feats == nids))
# Test reading edge data
eids = F.arange(0, int(g.number_of_edges('r1') / 2))
feats1 = g.edges['r1'].data['feat'][eids]
feats = F.squeeze(feats1, 1)
assert np.all(F.asnumpy(feats == eids))
# Test init node data
new_shape = (g.number_of_nodes('n1'), 2)
g.nodes['n1'].data['test1'] = dgl.distributed.DistTensor(new_shape, F.int32)
feats = g.nodes['n1'].data['test1'][nids]
assert np.all(F.asnumpy(feats) == 0)
# create a tensor and destroy a tensor and create it again.
test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init)
del test3
test3 = dgl.distributed.DistTensor((g.number_of_nodes('n1'), 3), F.float32, 'test3')
del test3
# add tests for anonymous distributed tensor.
test3 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
data = test3[0:10]
test4 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
del test3
test5 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
assert np.sum(F.asnumpy(test5[0:10] != data)) > 0
# test a persistent tensor
test4 = dgl.distributed.DistTensor(new_shape, F.float32, 'test4', init_func=rand_init,
persistent=True)
del test4
try:
test4 = dgl.distributed.DistTensor((g.number_of_nodes('n1'), 3), F.float32, 'test4')
raise Exception('')
except:
pass
# Test write data
new_feats = F.ones((len(nids), 2), F.int32, F.cpu())
g.nodes['n1'].data['test1'][nids] = new_feats
feats = g.nodes['n1'].data['test1'][nids]
assert np.all(F.asnumpy(feats) == 1)
# Test metadata operations.
assert len(g.nodes['n1'].data['feat']) == g.number_of_nodes('n1')
assert g.nodes['n1'].data['feat'].shape == (g.number_of_nodes('n1'), 1)
assert g.nodes['n1'].data['feat'].dtype == F.int64
selected_nodes = np.random.randint(0, 100, size=g.number_of_nodes('n1')) > 30
# Test node split
nodes = node_split(selected_nodes, g.get_partition_book(), ntype='n1')
nodes = F.asnumpy(nodes)
# We only have one partition, so the local nodes are basically all nodes in the graph.
local_nids = np.arange(g.number_of_nodes('n1'))
for n in nodes:
assert n in local_nids
print('end')
def check_server_client_hetero(shared_mem, num_servers, num_clients):
prepare_dist()
g = create_random_hetero()
# Partition the graph
num_parts = 1
graph_name = 'dist_graph_test_3'
partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')
# let's just test on one partition for now.
# We cannot run multiple servers and clients on the same machine.
serv_ps = []
ctx = mp.get_context('spawn')
for serv_id in range(num_servers):
p = ctx.Process(target=run_server, args=(graph_name, serv_id, num_servers,
num_clients, shared_mem))
serv_ps.append(p)
p.start()
cli_ps = []
num_nodes = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}
num_edges = {etype: g.number_of_edges(etype) for etype in g.etypes}
for cli_id in range(num_clients):
print('start client', cli_id)
p = ctx.Process(target=run_client_hetero, args=(graph_name, 0, num_servers, num_clients, num_nodes,
num_edges))
p.start()
cli_ps.append(p)
for p in cli_ps:
p.join()
for p in serv_ps:
p.join()
print('clients have terminated')
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support some of operations in DistGraph")
def test_server_client():
os.environ['DGL_DIST_MODE'] = 'distributed'
check_server_client_hetero(True, 1, 1)
check_server_client_hetero(False, 1, 1)
check_server_client(True, 1, 1)
check_server_client(False, 1, 1)
check_server_client(True, 2, 2)
......@@ -289,19 +428,19 @@ def test_split():
for i in range(num_parts):
set_roles(num_parts)
part_g, node_feats, edge_feats, gpb, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i)
part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i)
local_nids = F.nonzero_1d(part_g.ndata['inner_node'])
local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids)
nodes1 = np.intersect1d(selected_nodes, F.asnumpy(local_nids))
nodes2 = node_split(node_mask, gpb, i, force_even=False)
nodes2 = node_split(node_mask, gpb, rank=i, force_even=False)
assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2)))
local_nids = F.asnumpy(local_nids)
for n in nodes1:
assert n in local_nids
set_roles(num_parts * 2)
nodes3 = node_split(node_mask, gpb, i * 2, force_even=False)
nodes4 = node_split(node_mask, gpb, i * 2 + 1, force_even=False)
nodes3 = node_split(node_mask, gpb, rank=i * 2, force_even=False)
nodes4 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=False)
nodes5 = F.cat([nodes3, nodes4], 0)
assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes5)))
......@@ -309,15 +448,15 @@ def test_split():
local_eids = F.nonzero_1d(part_g.edata['inner_edge'])
local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids)
edges1 = np.intersect1d(selected_edges, F.asnumpy(local_eids))
edges2 = edge_split(edge_mask, gpb, i, force_even=False)
edges2 = edge_split(edge_mask, gpb, rank=i, force_even=False)
assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2)))
local_eids = F.asnumpy(local_eids)
for e in edges1:
assert e in local_eids
set_roles(num_parts * 2)
edges3 = edge_split(edge_mask, gpb, i * 2, force_even=False)
edges4 = edge_split(edge_mask, gpb, i * 2 + 1, force_even=False)
edges3 = edge_split(edge_mask, gpb, rank=i * 2, force_even=False)
edges4 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=False)
edges5 = F.cat([edges3, edges4], 0)
assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges5)))
......@@ -348,18 +487,18 @@ def test_split_even():
for i in range(num_parts):
set_roles(num_parts)
part_g, node_feats, edge_feats, gpb, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i)
part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i)
local_nids = F.nonzero_1d(part_g.ndata['inner_node'])
local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids)
nodes = node_split(node_mask, gpb, i, force_even=True)
nodes = node_split(node_mask, gpb, rank=i, force_even=True)
all_nodes1.append(nodes)
subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(local_nids))
print('part {} get {} nodes and {} are in the partition'.format(i, len(nodes), len(subset)))
set_roles(num_parts * 2)
nodes1 = node_split(node_mask, gpb, i * 2, force_even=True)
nodes2 = node_split(node_mask, gpb, i * 2 + 1, force_even=True)
nodes3 = F.cat([nodes1, nodes2], 0)
nodes1 = node_split(node_mask, gpb, rank=i * 2, force_even=True)
nodes2 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=True)
nodes3, _ = F.sort_1d(F.cat([nodes1, nodes2], 0))
all_nodes2.append(nodes3)
subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(nodes3))
print('intersection has', len(subset))
......@@ -367,15 +506,15 @@ def test_split_even():
set_roles(num_parts)
local_eids = F.nonzero_1d(part_g.edata['inner_edge'])
local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids)
edges = edge_split(edge_mask, gpb, i, force_even=True)
edges = edge_split(edge_mask, gpb, rank=i, force_even=True)
all_edges1.append(edges)
subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(local_eids))
print('part {} get {} edges and {} are in the partition'.format(i, len(edges), len(subset)))
set_roles(num_parts * 2)
edges1 = edge_split(edge_mask, gpb, i * 2, force_even=True)
edges2 = edge_split(edge_mask, gpb, i * 2 + 1, force_even=True)
edges3 = F.cat([edges1, edges2], 0)
edges1 = edge_split(edge_mask, gpb, rank=i * 2, force_even=True)
edges2 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=True)
edges3, _ = F.sort_1d(F.cat([edges1, edges2], 0))
all_edges2.append(edges3)
subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(edges3))
print('intersection has', len(subset))
......
......@@ -12,6 +12,7 @@ import time
from utils import get_local_usable_addr
from pathlib import Path
import pytest
from scipy import sparse as spsp
from dgl.distributed import DistGraphServer, DistGraph
......@@ -24,7 +25,7 @@ def start_server(rank, tmpdir, disable_shared_mem, graph_name):
def start_sample_client(rank, tmpdir, disable_shared_mem):
gpb = None
if disable_shared_mem:
_, _, _, gpb, _ = load_partition(tmpdir / 'test_sampling.json', rank)
_, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
dgl.distributed.initialize("rpc_ip_config.txt", 1)
dist_graph = DistGraph("test_sampling", gpb=gpb)
try:
......@@ -38,7 +39,7 @@ def start_sample_client(rank, tmpdir, disable_shared_mem):
def start_find_edges_client(rank, tmpdir, disable_shared_mem, eids):
gpb = None
if disable_shared_mem:
_, _, _, gpb, _ = load_partition(tmpdir / 'test_find_edges.json', rank)
_, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_find_edges.json', rank)
dgl.distributed.initialize("rpc_ip_config.txt", 1)
dist_graph = DistGraph("test_find_edges", gpb=gpb)
try:
......@@ -85,7 +86,7 @@ def check_rpc_sampling(tmpdir, num_server):
assert np.array_equal(
F.asnumpy(sampled_graph.edata[dgl.EID]), F.asnumpy(eids))
def check_rpc_find_edges(tmpdir, num_server):
def check_rpc_find_edges_shuffle(tmpdir, num_server):
ip_config = open("rpc_ip_config.txt", "w")
for _ in range(num_server):
ip_config.write('{}\n'.format(get_local_usable_addr()))
......@@ -96,7 +97,7 @@ def check_rpc_find_edges(tmpdir, num_server):
num_parts = num_server
partition_graph(g, 'test_find_edges', num_parts, tmpdir,
num_hops=1, part_method='metis', reshuffle=False)
num_hops=1, part_method='metis', reshuffle=True)
pserver_list = []
ctx = mp.get_context('spawn')
......@@ -106,15 +107,25 @@ def check_rpc_find_edges(tmpdir, num_server):
time.sleep(1)
pserver_list.append(p)
orig_nid = F.zeros((g.number_of_nodes(),), dtype=F.int64)
orig_eid = F.zeros((g.number_of_edges(),), dtype=F.int64)
for i in range(num_server):
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_find_edges.json', i)
orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']
time.sleep(3)
eids = F.tensor(np.random.randint(g.number_of_edges(), size=100))
u, v = g.find_edges(eids)
u, v = g.find_edges(orig_eid[eids])
du, dv = start_find_edges_client(0, tmpdir, num_server > 1, eids)
du = orig_nid[du]
dv = orig_nid[dv]
assert F.array_equal(u, du)
assert F.array_equal(v, dv)
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == 'tensorflow', reason='Not support tensorflow for now')
#@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
#@unittest.skipIf(dgl.backend.backend_name == 'tensorflow', reason='Not support tensorflow for now')
@unittest.skip('Only support partition with shuffle')
def test_rpc_sampling():
import tempfile
os.environ['DGL_DIST_MODE'] = 'distributed'
......@@ -152,7 +163,7 @@ def check_rpc_sampling_shuffle(tmpdir, num_server):
orig_nid = F.zeros((g.number_of_nodes(),), dtype=F.int64)
orig_eid = F.zeros((g.number_of_edges(),), dtype=F.int64)
for i in range(num_server):
part, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']
......@@ -165,6 +176,113 @@ def check_rpc_sampling_shuffle(tmpdir, num_server):
eids1 = orig_eid[sampled_graph.edata[dgl.EID]]
assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids))
def create_random_hetero():
num_nodes = {'n1': 1010, 'n2': 1000, 'n3': 1020}
etypes = [('n1', 'r1', 'n2'),
('n1', 'r2', 'n3'),
('n2', 'r3', 'n3')]
edges = {}
for etype in etypes:
src_ntype, _, dst_ntype = etype
arr = spsp.random(num_nodes[src_ntype], num_nodes[dst_ntype], density=0.001, format='coo',
random_state=100)
edges[etype] = (arr.row, arr.col)
g = dgl.heterograph(edges, num_nodes)
g.nodes['n1'].data['feat'] = F.ones((g.number_of_nodes('n1'), 10), F.float32, F.cpu())
return g
def start_hetero_sample_client(rank, tmpdir, disable_shared_mem):
gpb = None
if disable_shared_mem:
_, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
dgl.distributed.initialize("rpc_ip_config.txt", 1)
dist_graph = DistGraph("test_sampling", gpb=gpb)
assert 'feat' in dist_graph.nodes['n1'].data
assert 'feat' not in dist_graph.nodes['n2'].data
assert 'feat' not in dist_graph.nodes['n3'].data
if gpb is None:
gpb = dist_graph.get_partition_book()
try:
nodes = {'n3': [0, 10, 99, 66, 124, 208]}
sampled_graph = sample_neighbors(dist_graph, nodes, 3)
nodes = gpb.map_to_homo_nid(nodes['n3'], 'n3')
block = dgl.to_block(sampled_graph, nodes)
block.edata[dgl.EID] = sampled_graph.edata[dgl.EID]
except Exception as e:
print(e)
block = None
dgl.distributed.exit_client()
return block, gpb
def check_rpc_hetero_sampling_shuffle(tmpdir, num_server):
ip_config = open("rpc_ip_config.txt", "w")
for _ in range(num_server):
ip_config.write('{}\n'.format(get_local_usable_addr()))
ip_config.close()
g = create_random_hetero()
num_parts = num_server
num_hops = 1
partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=True)
pserver_list = []
ctx = mp.get_context('spawn')
for i in range(num_server):
p = ctx.Process(target=start_server, args=(i, tmpdir, num_server > 1, 'test_sampling'))
p.start()
time.sleep(1)
pserver_list.append(p)
time.sleep(3)
block, gpb = start_hetero_sample_client(0, tmpdir, num_server > 1)
print("Done sampling")
for p in pserver_list:
p.join()
orig_nid_map = F.zeros((g.number_of_nodes(),), dtype=F.int64)
orig_eid_map = F.zeros((g.number_of_edges(),), dtype=F.int64)
for i in range(num_server):
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
F.scatter_row_inplace(orig_nid_map, part.ndata[dgl.NID], part.ndata['orig_id'])
F.scatter_row_inplace(orig_eid_map, part.edata[dgl.EID], part.edata['orig_id'])
src, dst = block.edges()
# These are global Ids after shuffling.
shuffled_src = F.gather_row(block.srcdata[dgl.NID], src)
shuffled_dst = F.gather_row(block.dstdata[dgl.NID], dst)
shuffled_eid = block.edata[dgl.EID]
# Get node/edge types.
etype, _ = gpb.map_to_per_etype(shuffled_eid)
src_type, _ = gpb.map_to_per_ntype(shuffled_src)
dst_type, _ = gpb.map_to_per_ntype(shuffled_dst)
etype = F.asnumpy(etype)
src_type = F.asnumpy(src_type)
dst_type = F.asnumpy(dst_type)
# These are global Ids in the original graph.
orig_src = F.asnumpy(F.gather_row(orig_nid_map, shuffled_src))
orig_dst = F.asnumpy(F.gather_row(orig_nid_map, shuffled_dst))
orig_eid = F.asnumpy(F.gather_row(orig_eid_map, shuffled_eid))
etype_map = {g.get_etype_id(etype):etype for etype in g.etypes}
etype_to_eptype = {g.get_etype_id(etype):(src_ntype, dst_ntype) for src_ntype, etype, dst_ntype in g.canonical_etypes}
for e in np.unique(etype):
src_t = src_type[etype == e]
dst_t = dst_type[etype == e]
assert np.all(src_t == src_t[0])
assert np.all(dst_t == dst_t[0])
# Check the node Ids and edge Ids.
orig_src1, orig_dst1 = g.find_edges(orig_eid[etype == e], etype=etype_map[e])
assert np.all(F.asnumpy(orig_src1) == orig_src[etype == e])
assert np.all(F.asnumpy(orig_dst1) == orig_dst[etype == e])
# Check the node types.
src_ntype, dst_ntype = etype_to_eptype[e]
assert np.all(src_t == g.get_ntype_id(src_ntype))
assert np.all(dst_t == g.get_ntype_id(dst_ntype))
# Wait for the non-shared-memory graph store
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == 'tensorflow', reason='Not support tensorflow for now')
......@@ -174,13 +292,14 @@ def test_rpc_sampling_shuffle(num_server):
os.environ['DGL_DIST_MODE'] = 'distributed'
with tempfile.TemporaryDirectory() as tmpdirname:
check_rpc_sampling_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server)
def check_standalone_sampling(tmpdir):
def check_standalone_sampling(tmpdir, reshuffle):
g = CitationGraphDataset("cora")[0]
num_parts = 1
num_hops = 1
partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=False)
num_hops=num_hops, part_method='metis', reshuffle=reshuffle)
os.environ['DGL_DIST_MODE'] = 'standalone'
dgl.distributed.initialize("rpc_ip_config.txt", 1)
......@@ -201,13 +320,14 @@ def test_standalone_sampling():
import tempfile
os.environ['DGL_DIST_MODE'] = 'standalone'
with tempfile.TemporaryDirectory() as tmpdirname:
check_standalone_sampling(Path(tmpdirname))
check_standalone_sampling(Path(tmpdirname), False)
check_standalone_sampling(Path(tmpdirname), True)
def start_in_subgraph_client(rank, tmpdir, disable_shared_mem, nodes):
gpb = None
dgl.distributed.initialize("rpc_ip_config.txt", 1)
if disable_shared_mem:
_, _, _, gpb, _ = load_partition(tmpdir / 'test_in_subgraph.json', rank)
_, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_in_subgraph.json', rank)
dist_graph = DistGraph("test_in_subgraph", gpb=gpb)
try:
sampled_graph = dgl.distributed.in_subgraph(dist_graph, nodes)
......@@ -218,7 +338,7 @@ def start_in_subgraph_client(rank, tmpdir, disable_shared_mem, nodes):
return sampled_graph
def check_rpc_in_subgraph(tmpdir, num_server):
def check_rpc_in_subgraph_shuffle(tmpdir, num_server):
ip_config = open("rpc_ip_config.txt", "w")
for _ in range(num_server):
ip_config.write('{}\n'.format(get_local_usable_addr()))
......@@ -229,7 +349,7 @@ def check_rpc_in_subgraph(tmpdir, num_server):
num_parts = num_server
partition_graph(g, 'test_in_subgraph', num_parts, tmpdir,
num_hops=1, part_method='metis', reshuffle=False)
num_hops=1, part_method='metis', reshuffle=True)
pserver_list = []
ctx = mp.get_context('spawn')
......@@ -245,15 +365,27 @@ def check_rpc_in_subgraph(tmpdir, num_server):
for p in pserver_list:
p.join()
orig_nid = F.zeros((g.number_of_nodes(),), dtype=F.int64)
orig_eid = F.zeros((g.number_of_edges(),), dtype=F.int64)
for i in range(num_server):
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_in_subgraph.json', i)
orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']
src, dst = sampled_graph.edges()
src = orig_nid[src]
dst = orig_nid[dst]
assert sampled_graph.number_of_nodes() == g.number_of_nodes()
subg1 = dgl.in_subgraph(g, nodes)
assert np.all(F.asnumpy(g.has_edges_between(src, dst)))
subg1 = dgl.in_subgraph(g, orig_nid[nodes])
src1, dst1 = subg1.edges()
assert np.all(np.sort(F.asnumpy(src)) == np.sort(F.asnumpy(src1)))
assert np.all(np.sort(F.asnumpy(dst)) == np.sort(F.asnumpy(dst1)))
eids = g.edge_ids(src, dst)
assert np.array_equal(
F.asnumpy(sampled_graph.edata[dgl.EID]), F.asnumpy(eids))
eids1 = orig_eid[sampled_graph.edata[dgl.EID]]
assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids))
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == 'tensorflow', reason='Not support tensorflow for now')
......@@ -261,18 +393,21 @@ def test_rpc_in_subgraph():
import tempfile
os.environ['DGL_DIST_MODE'] = 'distributed'
with tempfile.TemporaryDirectory() as tmpdirname:
check_rpc_in_subgraph(Path(tmpdirname), 2)
check_rpc_in_subgraph_shuffle(Path(tmpdirname), 2)
if __name__ == "__main__":
import tempfile
with tempfile.TemporaryDirectory() as tmpdirname:
os.environ['DGL_DIST_MODE'] = 'standalone'
check_standalone_sampling(Path(tmpdirname))
check_standalone_sampling(Path(tmpdirname), True)
check_standalone_sampling(Path(tmpdirname), False)
os.environ['DGL_DIST_MODE'] = 'distributed'
check_rpc_in_subgraph(Path(tmpdirname), 2)
check_rpc_sampling_shuffle(Path(tmpdirname), 1)
check_rpc_sampling_shuffle(Path(tmpdirname), 2)
check_rpc_sampling(Path(tmpdirname), 2)
check_rpc_sampling(Path(tmpdirname), 1)
check_rpc_find_edges(Path(tmpdirname), 2)
check_rpc_find_edges(Path(tmpdirname), 1)
check_rpc_find_edges_shuffle(Path(tmpdirname), 2)
check_rpc_find_edges_shuffle(Path(tmpdirname), 1)
check_rpc_in_subgraph_shuffle(Path(tmpdirname), 2)
check_rpc_sampling_shuffle(Path(tmpdirname), 1)
check_rpc_sampling_shuffle(Path(tmpdirname), 2)
check_rpc_hetero_sampling_shuffle(Path(tmpdirname), 1)
check_rpc_hetero_sampling_shuffle(Path(tmpdirname), 2)
......@@ -45,18 +45,28 @@ def start_server(rank, tmpdir, disable_shared_mem, num_clients):
g.start()
def start_dist_dataloader(rank, tmpdir, disable_shared_mem, num_workers, drop_last):
def start_dist_dataloader(rank, tmpdir, num_server, num_workers, drop_last):
import dgl
import torch as th
dgl.distributed.initialize("mp_ip_config.txt", 1, num_workers=num_workers)
gpb = None
disable_shared_mem = num_server > 0
if disable_shared_mem:
_, _, _, gpb, _ = load_partition(tmpdir / 'test_sampling.json', rank)
_, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
num_nodes_to_sample = 202
batch_size = 32
train_nid = th.arange(num_nodes_to_sample)
dist_graph = DistGraph("test_mp", gpb=gpb, part_config=tmpdir / 'test_sampling.json')
orig_nid = F.arange(0, dist_graph.number_of_nodes())
orig_eid = F.arange(0, dist_graph.number_of_edges())
for i in range(num_server):
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
if 'orig_id' in part.ndata:
orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
if 'orig_id' in part.edata:
orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']
# Create sampler
sampler = NeighborSampler(dist_graph, [5, 10],
dgl.distributed.sample_neighbors)
......@@ -80,9 +90,12 @@ def start_dist_dataloader(rank, tmpdir, disable_shared_mem, num_workers, drop_la
o_src, o_dst = block.edges()
src_nodes_id = block.srcdata[dgl.NID][o_src]
dst_nodes_id = block.dstdata[dgl.NID][o_dst]
max_nid.append(np.max(F.asnumpy(dst_nodes_id)))
src_nodes_id = orig_nid[src_nodes_id]
dst_nodes_id = orig_nid[dst_nodes_id]
has_edges = groundtruth_g.has_edges_between(src_nodes_id, dst_nodes_id)
assert np.all(F.asnumpy(has_edges))
max_nid.append(np.max(F.asnumpy(dst_nodes_id)))
# assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size))
if drop_last:
assert np.max(max_nid) == num_nodes_to_sample - 1 - num_nodes_to_sample % batch_size
......@@ -105,11 +118,11 @@ def test_standalone(tmpdir):
num_hops = 1
partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=False)
num_hops=num_hops, part_method='metis', reshuffle=True)
os.environ['DGL_DIST_MODE'] = 'standalone'
try:
start_dist_dataloader(0, tmpdir, False, 2, True)
start_dist_dataloader(0, tmpdir, 1, 2, True)
except Exception as e:
print(e)
dgl.distributed.exit_client() # this is needed since there are two tests here in one process
......@@ -120,7 +133,8 @@ def test_standalone(tmpdir):
@pytest.mark.parametrize("num_server", [3])
@pytest.mark.parametrize("num_workers", [0, 4])
@pytest.mark.parametrize("drop_last", [True, False])
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last):
@pytest.mark.parametrize("reshuffle", [True, False])
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle):
ip_config = open("mp_ip_config.txt", "w")
for _ in range(num_server):
ip_config.write('{}\n'.format(get_local_usable_addr()))
......@@ -132,7 +146,7 @@ def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last):
num_hops = 1
partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=False)
num_hops=num_hops, part_method='metis', reshuffle=reshuffle)
pserver_list = []
ctx = mp.get_context('spawn')
......@@ -146,7 +160,7 @@ def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last):
time.sleep(3)
os.environ['DGL_DIST_MODE'] = 'distributed'
ptrainer = ctx.Process(target=start_dist_dataloader, args=(
0, tmpdir, num_server > 1, num_workers, drop_last))
0, tmpdir, num_server, num_workers, drop_last))
ptrainer.start()
time.sleep(1)
......@@ -154,18 +168,26 @@ def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last):
p.join()
ptrainer.join()
def start_node_dataloader(rank, tmpdir, disable_shared_mem, num_workers):
def start_node_dataloader(rank, tmpdir, num_server, num_workers):
import dgl
import torch as th
dgl.distributed.initialize("mp_ip_config.txt", 1, num_workers=num_workers)
gpb = None
disable_shared_mem = num_server > 1
if disable_shared_mem:
_, _, _, gpb, _ = load_partition(tmpdir / 'test_sampling.json', rank)
_, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
num_nodes_to_sample = 202
batch_size = 32
train_nid = th.arange(num_nodes_to_sample)
dist_graph = DistGraph("test_mp", gpb=gpb, part_config=tmpdir / 'test_sampling.json')
orig_nid = F.zeros((dist_graph.number_of_nodes(),), dtype=F.int64)
orig_eid = F.zeros((dist_graph.number_of_edges(),), dtype=F.int64)
for i in range(num_server):
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']
# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10])
......@@ -190,6 +212,8 @@ def start_node_dataloader(rank, tmpdir, disable_shared_mem, num_workers):
o_src, o_dst = block.edges()
src_nodes_id = block.srcdata[dgl.NID][o_src]
dst_nodes_id = block.dstdata[dgl.NID][o_dst]
src_nodes_id = orig_nid[src_nodes_id]
dst_nodes_id = orig_nid[dst_nodes_id]
has_edges = groundtruth_g.has_edges_between(src_nodes_id, dst_nodes_id)
assert np.all(F.asnumpy(has_edges))
max_nid.append(np.max(F.asnumpy(dst_nodes_id)))
......@@ -215,7 +239,7 @@ def test_dataloader(tmpdir, num_server, num_workers, dataloader_type):
num_hops = 1
partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=False)
num_hops=num_hops, part_method='metis', reshuffle=True)
pserver_list = []
ctx = mp.get_context('spawn')
......@@ -231,7 +255,7 @@ def test_dataloader(tmpdir, num_server, num_workers, dataloader_type):
ptrainer_list = []
if dataloader_type == 'node':
p = ctx.Process(target=start_node_dataloader, args=(
0, tmpdir, num_server > 1, num_workers))
0, tmpdir, num_server, num_workers))
p.start()
time.sleep(1)
ptrainer_list.append(p)
......@@ -243,7 +267,9 @@ def test_dataloader(tmpdir, num_server, num_workers, dataloader_type):
if __name__ == "__main__":
import tempfile
with tempfile.TemporaryDirectory() as tmpdirname:
test_dataloader(Path(tmpdirname), 3, 4, 'node')
test_standalone(Path(tmpdirname))
test_dist_dataloader(Path(tmpdirname), 3, 0, True)
test_dist_dataloader(Path(tmpdirname), 3, 4, True)
test_dist_dataloader(Path(tmpdirname), 3, 0, True, True)
test_dist_dataloader(Path(tmpdirname), 3, 4, True, True)
test_dist_dataloader(Path(tmpdirname), 3, 0, True, False)
test_dist_dataloader(Path(tmpdirname), 3, 4, True, False)
test_dataloader(Path(tmpdirname), 3, 4, 'node')
......@@ -64,10 +64,10 @@ gpb = dgl.distributed.graph_partition_book.BasicPartitionBook(part_id=0,
edge_map=edge_map,
part_graph=g)
node_policy = dgl.distributed.PartitionPolicy(policy_str='node',
node_policy = dgl.distributed.PartitionPolicy(policy_str='node:_N',
partition_book=gpb)
edge_policy = dgl.distributed.PartitionPolicy(policy_str='edge',
edge_policy = dgl.distributed.PartitionPolicy(policy_str='edge:_E',
partition_book=gpb)
data_0 = F.tensor([[1.,1.],[1.,1.],[1.,1.],[1.,1.],[1.,1.],[1.,1.]], F.float32)
......@@ -88,8 +88,6 @@ def add_push(target, name, id_tensor, data_tensor):
@unittest.skipIf(os.name == 'nt' or os.getenv('DGLBACKEND') == 'tensorflow', reason='Do not support windows and TF yet')
def test_partition_policy():
assert node_policy.policy_str == 'node'
assert edge_policy.policy_str == 'edge'
assert node_policy.part_id == 0
assert edge_policy.part_id == 0
local_nid = node_policy.to_local(F.tensor([0,1,2,3,4,5]))
......@@ -114,15 +112,15 @@ def start_server(server_id, num_clients, num_servers):
kvserver.add_part_policy(node_policy)
kvserver.add_part_policy(edge_policy)
if kvserver.is_backup_server():
kvserver.init_data('data_0', 'node')
kvserver.init_data('data_0_1', 'node')
kvserver.init_data('data_0_2', 'node')
kvserver.init_data('data_0_3', 'node')
kvserver.init_data('data_0', 'node:_N')
kvserver.init_data('data_0_1', 'node:_N')
kvserver.init_data('data_0_2', 'node:_N')
kvserver.init_data('data_0_3', 'node:_N')
else:
kvserver.init_data('data_0', 'node', data_0)
kvserver.init_data('data_0_1', 'node', data_0_1)
kvserver.init_data('data_0_2', 'node', data_0_2)
kvserver.init_data('data_0_3', 'node', data_0_3)
kvserver.init_data('data_0', 'node:_N', data_0)
kvserver.init_data('data_0_1', 'node:_N', data_0_1)
kvserver.init_data('data_0_2', 'node:_N', data_0_2)
kvserver.init_data('data_0_3', 'node:_N', data_0_3)
# start server
server_state = dgl.distributed.ServerState(kv_store=kvserver, local_g=None, partition_book=None)
dgl.distributed.start_server(server_id=server_id,
......@@ -139,9 +137,9 @@ def start_server_mul_role(server_id, num_clients, num_servers):
num_clients=num_clients)
kvserver.add_part_policy(node_policy)
if kvserver.is_backup_server():
kvserver.init_data('data_0', 'node')
kvserver.init_data('data_0', 'node:_N')
else:
kvserver.init_data('data_0', 'node', data_0)
kvserver.init_data('data_0', 'node:_N', data_0)
# start server
server_state = dgl.distributed.ServerState(kv_store=kvserver, local_g=None, partition_book=None)
dgl.distributed.start_server(server_id=server_id,
......@@ -183,37 +181,37 @@ def start_client(num_clients, num_servers):
dtype, shape, policy = meta
assert dtype == F.dtype(data_0)
assert shape == F.shape(data_0)
assert policy.policy_str == 'node'
assert policy.policy_str == 'node:_N'
meta = kvclient.get_data_meta('data_0_1')
dtype, shape, policy = meta
assert dtype == F.dtype(data_0_1)
assert shape == F.shape(data_0_1)
assert policy.policy_str == 'node'
assert policy.policy_str == 'node:_N'
meta = kvclient.get_data_meta('data_0_2')
dtype, shape, policy = meta
assert dtype == F.dtype(data_0_2)
assert shape == F.shape(data_0_2)
assert policy.policy_str == 'node'
assert policy.policy_str == 'node:_N'
meta = kvclient.get_data_meta('data_0_3')
dtype, shape, policy = meta
assert dtype == F.dtype(data_0_3)
assert shape == F.shape(data_0_3)
assert policy.policy_str == 'node'
assert policy.policy_str == 'node:_N'
meta = kvclient.get_data_meta('data_1')
dtype, shape, policy = meta
assert dtype == F.dtype(data_1)
assert shape == F.shape(data_1)
assert policy.policy_str == 'edge'
assert policy.policy_str == 'edge:_E'
meta = kvclient.get_data_meta('data_2')
dtype, shape, policy = meta
assert dtype == F.dtype(data_2)
assert shape == F.shape(data_2)
assert policy.policy_str == 'node'
assert policy.policy_str == 'node:_N'
# Test push and pull
id_tensor = F.tensor([0,2,4], F.int64)
......
......@@ -12,10 +12,144 @@ import unittest
import pickle
import random
def _get_inner_node_mask(graph, ntype_id):
if dgl.NTYPE in graph.ndata:
dtype = F.dtype(graph.ndata['inner_node'])
return graph.ndata['inner_node'] * F.astype(graph.ndata[dgl.NTYPE] == ntype_id, dtype) == 1
else:
return graph.ndata['inner_node'] == 1
def _get_inner_edge_mask(graph, etype_id):
if dgl.ETYPE in graph.edata:
dtype = F.dtype(graph.edata['inner_edge'])
return graph.edata['inner_edge'] * F.astype(graph.edata[dgl.ETYPE] == etype_id, dtype) == 1
else:
return graph.edata['inner_edge'] == 1
def _get_part_ranges(id_ranges):
if isinstance(id_ranges, dict):
return {key:np.concatenate([np.array(l) for l in id_ranges[key]]).reshape(-1, 2) \
for key in id_ranges}
else:
return np.concatenate([np.array(l) for l in id_ranges]).reshape(-1, 2)
def create_random_graph(n):
arr = (spsp.random(n, n, density=0.001, format='coo', random_state=100) != 0).astype(np.int64)
return dgl.from_scipy(arr)
def create_random_hetero():
num_nodes = {'n1': 10000, 'n2': 10010, 'n3': 10020}
etypes = [('n1', 'r1', 'n2'),
('n1', 'r2', 'n3'),
('n2', 'r3', 'n3')]
edges = {}
for etype in etypes:
src_ntype, _, dst_ntype = etype
arr = spsp.random(num_nodes[src_ntype], num_nodes[dst_ntype], density=0.001, format='coo',
random_state=100)
edges[etype] = (arr.row, arr.col)
return dgl.heterograph(edges, num_nodes)
def verify_hetero_graph(g, parts):
num_nodes = {ntype:0 for ntype in g.ntypes}
num_edges = {etype:0 for etype in g.etypes}
for part in parts:
assert len(g.ntypes) == len(F.unique(part.ndata[dgl.NTYPE]))
assert len(g.etypes) == len(F.unique(part.edata[dgl.ETYPE]))
for ntype in g.ntypes:
ntype_id = g.get_ntype_id(ntype)
inner_node_mask = _get_inner_node_mask(part, ntype_id)
num_inner_nodes = F.sum(F.astype(inner_node_mask, F.int64), 0)
num_nodes[ntype] += num_inner_nodes
for etype in g.etypes:
etype_id = g.get_etype_id(etype)
inner_edge_mask = _get_inner_edge_mask(part, etype_id)
num_inner_edges = F.sum(F.astype(inner_edge_mask, F.int64), 0)
num_edges[etype] += num_inner_edges
# Verify the number of nodes of each type is correct.
for ntype in g.ntypes:
print('node {}: {}, {}'.format(ntype, g.number_of_nodes(ntype), num_nodes[ntype]))
assert g.number_of_nodes(ntype) == num_nodes[ntype]
# Verify the number of edges of each type is correct.
for etype in g.etypes:
print('edge {}: {}, {}'.format(etype, g.number_of_edges(etype), num_edges[etype]))
assert g.number_of_edges(etype) == num_edges[etype]
nids = {ntype:[] for ntype in g.ntypes}
eids = {etype:[] for etype in g.etypes}
for part in parts:
src, dst, eid = part.edges(form='all')
orig_src = F.gather_row(part.ndata['orig_id'], src)
orig_dst = F.gather_row(part.ndata['orig_id'], dst)
orig_eid = F.gather_row(part.edata['orig_id'], eid)
etype_arr = F.gather_row(part.edata[dgl.ETYPE], eid)
eid_type = F.gather_row(part.edata[dgl.EID], eid)
for etype in g.etypes:
etype_id = g.get_etype_id(etype)
src1 = F.boolean_mask(orig_src, etype_arr == etype_id)
dst1 = F.boolean_mask(orig_dst, etype_arr == etype_id)
eid1 = F.boolean_mask(orig_eid, etype_arr == etype_id)
exist = g.has_edges_between(src1, dst1, etype=etype)
assert np.all(F.asnumpy(exist))
eid2 = g.edge_ids(src1, dst1, etype=etype)
assert np.all(F.asnumpy(eid1 == eid2))
eids[etype].append(F.boolean_mask(eid_type, etype_arr == etype_id))
# Make sure edge Ids fall into a range.
inner_edge_mask = _get_inner_edge_mask(part, etype_id)
inner_eids = np.sort(F.asnumpy(F.boolean_mask(part.edata[dgl.EID], inner_edge_mask)))
assert np.all(inner_eids == np.arange(inner_eids[0], inner_eids[-1] + 1))
for ntype in g.ntypes:
ntype_id = g.get_ntype_id(ntype)
# Make sure inner nodes have Ids fall into a range.
inner_node_mask = _get_inner_node_mask(part, ntype_id)
inner_nids = F.boolean_mask(part.ndata[dgl.NID], inner_node_mask)
assert np.all(F.asnumpy(inner_nids == F.arange(F.as_scalar(inner_nids[0]),
F.as_scalar(inner_nids[-1]) + 1)))
nids[ntype].append(inner_nids)
for ntype in nids:
nids_type = F.cat(nids[ntype], 0)
uniq_ids = F.unique(nids_type)
# We should get all nodes.
assert len(uniq_ids) == g.number_of_nodes(ntype)
for etype in eids:
eids_type = F.cat(eids[etype], 0)
uniq_ids = F.unique(eids_type)
assert len(uniq_ids) == g.number_of_edges(etype)
# TODO(zhengda) this doesn't check 'part_id'
def verify_graph_feats(g, part, node_feats):
for ntype in g.ntypes:
ntype_id = g.get_ntype_id(ntype)
for name in g.nodes[ntype].data:
if name in [dgl.NID, 'inner_node']:
continue
inner_node_mask = _get_inner_node_mask(part, ntype_id)
inner_nids = F.boolean_mask(part.ndata[dgl.NID],inner_node_mask)
min_nids = F.min(inner_nids, 0)
orig_id = F.boolean_mask(part.ndata['orig_id'], inner_node_mask)
true_feats = F.gather_row(g.nodes[ntype].data[name], orig_id)
ndata = F.gather_row(node_feats[ntype + '/' + name], inner_nids - min_nids)
assert np.all(F.asnumpy(ndata == true_feats))
def check_hetero_partition(hg, part_method):
hg.nodes['n1'].data['labels'] = F.arange(0, hg.number_of_nodes('n1'))
hg.nodes['n1'].data['feats'] = F.tensor(np.random.randn(hg.number_of_nodes('n1'), 10), F.float32)
hg.edges['r1'].data['feats'] = F.tensor(np.random.randn(hg.number_of_edges('r1'), 10), F.float32)
num_parts = 4
num_hops = 1
partition_graph(hg, 'test', num_parts, '/tmp/partition', num_hops=num_hops,
part_method=part_method, reshuffle=True)
parts = []
for i in range(num_parts):
part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition('/tmp/partition/test.json', i)
parts.append(part_g)
verify_graph_feats(hg, part_g, node_feats)
verify_hetero_graph(hg, parts)
def check_partition(g, part_method, reshuffle):
g.ndata['labels'] = F.arange(0, g.number_of_nodes())
g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10), F.float32)
......@@ -29,7 +163,7 @@ def check_partition(g, part_method, reshuffle):
part_method=part_method, reshuffle=reshuffle)
part_sizes = []
for i in range(num_parts):
part_g, node_feats, edge_feats, gpb, _ = load_partition('/tmp/partition/test.json', i)
part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition('/tmp/partition/test.json', i)
# Check the metadata
assert gpb._num_nodes() == g.number_of_nodes()
......@@ -79,13 +213,13 @@ def check_partition(g, part_method, reshuffle):
F.gather_row(part_g.ndata['eh'], llocal_nodes))
for name in ['labels', 'feats']:
assert name in node_feats
assert node_feats[name].shape[0] == len(local_nodes)
assert np.all(F.asnumpy(g.ndata[name])[F.asnumpy(local_nodes)] == F.asnumpy(node_feats[name]))
assert '_N/' + name in node_feats
assert node_feats['_N/' + name].shape[0] == len(local_nodes)
assert np.all(F.asnumpy(g.ndata[name])[F.asnumpy(local_nodes)] == F.asnumpy(node_feats['_N/' + name]))
for name in ['feats']:
assert name in edge_feats
assert edge_feats[name].shape[0] == len(local_edges)
assert np.all(F.asnumpy(g.edata[name])[F.asnumpy(local_edges)] == F.asnumpy(edge_feats[name]))
assert '_E/' + name in edge_feats
assert edge_feats['_E/' + name].shape[0] == len(local_edges)
assert np.all(F.asnumpy(g.edata[name])[F.asnumpy(local_edges)] == F.asnumpy(edge_feats['_E/' + name]))
if reshuffle:
node_map = []
......@@ -105,18 +239,16 @@ def check_partition(g, part_method, reshuffle):
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_partition():
g = create_random_graph(10000)
check_partition(g, 'metis', True)
check_partition(g, 'metis', False)
check_partition(g, 'random', True)
check_partition(g, 'metis', True)
check_partition(g, 'random', False)
check_partition(g, 'random', True)
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_hetero_partition():
g = create_random_graph(10000)
check_partition(g, 'metis', True)
check_partition(g, 'metis', False)
check_partition(g, 'random', True)
check_partition(g, 'random', False)
hg = create_random_hetero()
check_hetero_partition(hg, 'metis')
check_hetero_partition(hg, 'random')
if __name__ == '__main__':
......