"git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "be53add4d11011b4316e9a0fbe3aa42a5ce33119"
Unverified Commit b377e1b9 authored by Rhett Ying, committed by GitHub

[Dist][Examples] refactor dist graphsage examples (#4269)

* [Dist][Examples] refactor dist graphsage examples

* refine train_dist.py

* update train_dist_unsupervised.py

* fix debug info

* update train_dist_transductive

* update unsupervised_transductive

* remove distgnn

* fix join() in standalone mode

* change batch_labels to long() for ogbn-papers100M

* free unnecessary mem

* lint

* fix lint

* refine

* fix lint

* fix incorrect args

* refine
parent ff090f69
...@@ -137,7 +137,6 @@ python3 ~/workspace/dgl/tools/launch.py \
--num_servers 1 \
--part_config data/ogb-product.json \
--ip_config ip_config.txt \
"python3 train_dist_unsupervised.py --graph_name ogb-product --ip_config ip_config.txt --num_epochs 3 --batch_size 1000"
```
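For reference, the `ip_config.txt` passed to `launch.py` and to the training scripts lists the machines that participate in training, one address per line (a port may optionally follow the address). A minimal sketch with placeholder addresses, not taken from this repository:
```
172.31.0.1
172.31.0.2
172.31.0.3
172.31.0.4
```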
...@@ -158,24 +157,22 @@ To run supervised with transductive setting (nodes are initialized with node emb
```bash
python3 ~/workspace/dgl/tools/launch.py --workspace ~/workspace/dgl/examples/pytorch/graphsage/dist/ \
--num_trainers 4 \
--num_servers 1 \
--num_samplers 0 \
--part_config data/ogb-product.json \
--ip_config ip_config.txt \
"python3 train_dist_transductive.py --graph_name ogb-product --ip_config ip_config.txt --batch_size 1000 --num_gpus 4 --eval_every 5"
```
To run supervised with transductive setting using dgl distributed DistEmbedding
```bash
python3 ~/workspace/dgl/tools/launch.py --workspace ~/workspace/dgl/examples/pytorch/graphsage/dist/ \
--num_trainers 4 \
--num_servers 1 \
--num_samplers 0 \
--part_config data/ogb-product.json \
--ip_config ip_config.txt \
"python3 train_dist_transductive.py --graph_name ogb-product --ip_config ip_config.txt --batch_size 1000 --num_gpus 4 --eval_every 5 --dgl_sparse"
```
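The `--dgl_sparse` flag selects which sparse node-embedding back end `train_dist_transductive.py` uses. A hedged sketch of the two code paths, based on the `DistEmb` module in that script; the `dgl.distributed.optim.SparseAdam` branch is an assumption (it is not visible in the hunks below), and `g`, `args`, and `device` are taken from the training script:
```python
import torch as th
import dgl
from train_dist_transductive import DistEmb

# Sketch only: names follow the DistEmb module in this example.
emb_layer = DistEmb(
    g.num_nodes(),
    args.num_hidden,
    dgl_sparse_emb=args.dgl_sparse,
    dev_id=device,
)
if args.dgl_sparse:
    # DistEmbedding rows live on the graph servers; DGL's sparse optimizer
    # pulls and updates only the rows touched by each mini-batch.
    emb_optimizer = dgl.distributed.optim.SparseAdam(
        [emb_layer.sparse_emb], lr=args.sparse_lr
    )
else:
    # Plain torch.nn.Embedding(sparse=True), updated with torch.optim.SparseAdam.
    emb_optimizer = th.optim.SparseAdam(
        list(emb_layer.sparse_emb.parameters()), lr=args.sparse_lr
    )
```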
To run unsupervised with transductive setting (nodes are initialized with node embedding)
...@@ -186,7 +183,6 @@ python3 ~/workspace/dgl/tools/launch.py --workspace ~/workspace/dgl/examples/pyt
--num_servers 1 \
--part_config data/ogb-product.json \
--ip_config ip_config.txt \
"python3 train_dist_unsupervised_transductive.py --graph_name ogb-product --ip_config ip_config.txt --num_epochs 3 --batch_size 1000 --num_gpus 4"
```
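The unsupervised scripts train with a link-prediction objective instead of node labels: embeddings of the endpoints of observed edges should score higher than embeddings of uniformly sampled negative pairs. A simplified, self-contained sketch of that idea (the actual `CrossEntropyLoss` module imported from `train_dist_unsupervised.py` computes the scores from the positive/negative pair graphs returned by the dataloader):
```python
import torch as th
import torch.nn.functional as F

def link_prediction_loss(h_pos_src, h_pos_dst, h_neg_src, h_neg_dst):
    # Dot-product scores for positive and negative node pairs.
    pos_score = (h_pos_src * h_pos_dst).sum(dim=-1)
    neg_score = (h_neg_src * h_neg_dst).sum(dim=-1)
    score = th.cat([pos_score, neg_score])
    label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)])
    # Push positive pairs toward 1 and negative pairs toward 0.
    return F.binary_cross_entropy_with_logits(score, label)
```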
...@@ -198,7 +194,6 @@ python3 ~/workspace/dgl/tools/launch.py --workspace ~/workspace/dgl/examples/pyt
--num_servers 1 \
--part_config data/ogb-product.json \
--ip_config ip_config.txt \
"python3 train_dist_unsupervised_transductive.py --graph_name ogb-product --ip_config ip_config.txt --num_epochs 3 --batch_size 1000 --num_gpus 4 --dgl_sparse"
```
...
import os
os.environ["DGLBACKEND"] = "pytorch"
import argparse
import socket
import time
from contextlib import contextmanager
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
import dgl
import dgl.nn.pytorch as dglnn
def load_subtensor(g, seeds, input_nodes, device, load_feat=True):
"""
...@@ -37,40 +23,6 @@ def load_subtensor(g, seeds, input_nodes, device, load_feat=True):
return batch_inputs, batch_labels
class NeighborSampler(object):
def __init__(self, g, fanouts, sample_neighbors, device, load_feat=True):
self.g = g
self.fanouts = fanouts
self.sample_neighbors = sample_neighbors
self.device = device
self.load_feat = load_feat
def sample_blocks(self, seeds):
seeds = th.LongTensor(np.asarray(seeds))
blocks = []
for fanout in self.fanouts:
# For each seed node, sample ``fanout`` neighbors.
frontier = self.sample_neighbors(
self.g, seeds, fanout, replace=True
)
# Then we compact the frontier into a bipartite graph for message passing.
block = dgl.to_block(frontier, seeds)
# Obtain the seed nodes for next layer.
seeds = block.srcdata[dgl.NID]
blocks.insert(0, block)
input_nodes = blocks[0].srcdata[dgl.NID]
seeds = blocks[-1].dstdata[dgl.NID]
batch_inputs, batch_labels = load_subtensor(
self.g, seeds, input_nodes, "cpu", self.load_feat
)
if self.load_feat:
blocks[0].srcdata["features"] = batch_inputs
blocks[-1].dstdata["labels"] = batch_labels
return blocks
class DistSAGE(nn.Module):
def __init__(
self, in_feats, n_hidden, n_classes, n_layers, activation, dropout
...@@ -89,72 +41,68 @@ class DistSAGE(nn.Module):
def forward(self, blocks, x):
h = x
for i, (layer, block) in enumerate(zip(self.layers, blocks)):
h = layer(block, h)
if i != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
"""
Distributed layer-wise inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
x : the input of entire node set.
"""
# During inference with sampling, multi-layer blocks are very inefficient
# because lots of computations in the first few layers are repeated.
# Therefore, we compute the representation of all nodes layer by layer.
# The nodes on each layer are of course split into batches.
# TODO: can we standardize this?
nodes = dgl.distributed.node_split(
np.arange(g.num_nodes()),
g.get_partition_book(),
force_even=True,
)
y = dgl.distributed.DistTensor(
(g.num_nodes(), self.n_hidden),
th.float32,
"h",
persistent=True,
)
for i, layer in enumerate(self.layers):
if i == len(self.layers) - 1:
y = dgl.distributed.DistTensor(
(g.num_nodes(), self.n_classes),
th.float32,
"h_last",
persistent=True,
)
print(
f"|V|={g.num_nodes()}, eval batch size: {batch_size}"
)
sampler = dgl.dataloading.NeighborSampler([-1])
dataloader = dgl.dataloading.DistNodeDataLoader(
g,
nodes,
sampler,
batch_size=batch_size,
shuffle=False,
drop_last=False,
)
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
block = blocks[0].to(device)
h = x[input_nodes].to(device)
h_dst = h[: block.number_of_dst_nodes()]
h = layer(block, (h, h_dst))
if i != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
...@@ -164,6 +112,11 @@ class DistSAGE(nn.Module):
g.barrier()
return y
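# DistributedDataParallel exposes join() as a context manager; the no-op
# version below lets the training loop always write `with model.join():`,
# whether or not the model is wrapped in DDP (i.e. in standalone mode).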
@contextmanager
def join(self):
"""dummy join for standalone"""
yield
def compute_acc(pred, labels):
"""
...@@ -196,23 +149,18 @@ def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, n_classes, g = data
shuffle = True
# prefetch_node_feats/prefetch_labels are not supported for DistGraph yet.
sampler = dgl.dataloading.NeighborSampler(
[int(fanout) for fanout in args.fan_out.split(",")]
)
dataloader = dgl.dataloading.DistNodeDataLoader(
g,
train_nid,
sampler,
batch_size=args.batch_size,
shuffle=shuffle,
drop_last=False,
)
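# DistNodeDataLoader performs distributed neighbor sampling internally and
# yields (input_nodes, seeds, blocks) per iteration, so no custom
# NeighborSampler/collate_fn pair is needed any more.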
# Define model and optimizer
model = DistSAGE(
in_feats,
...@@ -247,28 +195,27 @@ def run(args, device, data):
num_seeds = 0
num_inputs = 0
start = time.time()
# Loop over the dataloader to sample the computation dependency graph
# as a list of blocks.
step_time = []
with model.join():
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
tic_step = time.time()
sample_time += tic_step - start
# fetch features/labels
batch_inputs, batch_labels = load_subtensor(
g, seeds, input_nodes, "cpu"
)
batch_labels = batch_labels.long()
num_seeds += len(blocks[-1].dstdata[dgl.NID])
num_inputs += len(blocks[0].srcdata[dgl.NID])
# move to target device
blocks = [block.to(device) for block in blocks]
batch_inputs = batch_inputs.to(device)
batch_labels = batch_labels.to(device)
# Compute loss and prediction
start = time.time()
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
forward_end = time.time()
...@@ -292,7 +239,9 @@ def run(args, device, data):
else 0
)
print(
"Part {} | Epoch {:05d} | Step {:05d} | Loss {:.4f} | "
"Train Acc {:.4f} | Speed (samples/sec) {:.4f} | GPU "
"{:.1f} MB | time {:.3f} s".format(
g.rank(),
epoch,
step,
...@@ -300,14 +249,16 @@
acc.item(),
np.mean(iter_tput[3:]),
gpu_mem_alloc,
np.sum(step_time[-args.log_every:]),
)
)
start = time.time()
toc = time.time()
print(
"Part {}, Epoch Time(s): {:.4f}, sample+data_copy: {:.4f}, "
"forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #seeds: {}, "
"#inputs: {}".format(
g.rank(),
toc - tic,
sample_time,
...@@ -323,7 +274,7 @@ def run(args, device, data):
if epoch % args.eval_every == 0 and epoch != 0:
start = time.time()
val_acc, test_acc = evaluate(
model if args.standalone else model.module,
g,
g.ndata["features"],
g.ndata["labels"],
...@@ -333,7 +284,8 @@ def run(args, device, data):
device,
)
print(
"Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}".format(
g.rank(), val_acc, test_acc, time.time() - start
)
)
...@@ -346,7 +298,10 @@ def main(args):
print(socket.gethostname(), "Initializing DGL process group")
th.distributed.init_process_group(backend=args.backend)
print(socket.gethostname(), "Initializing DistGraph")
g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
print(socket.gethostname(), "rank:", g.rank())
pb = g.get_partition_book()
...@@ -381,7 +336,8 @@ def main(args):
)
local_nid = pb.partid2nids(pb.partid).detach().numpy()
print(
"part {}, train: {} (local: {}), val: {} (local: {}), test: {} "
"(local: {})".format(
g.rank(),
len(train_nid),
len(np.intersect1d(train_nid.numpy(), local_nid)),
...@@ -398,8 +354,8 @@ def main(args):
dev_id = g.rank() % args.num_gpus
device = th.device("cuda:" + str(dev_id))
n_classes = args.n_classes
if n_classes == 0:
labels = g.ndata["labels"][np.arange(g.num_nodes())]
n_classes = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
del labels
print("#labels:", n_classes)
...@@ -413,7 +369,6 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GCN")
parser.add_argument("--graph_name", type=str, help="graph name")
parser.add_argument("--id", type=int, help="the partition id")
parser.add_argument(
...@@ -422,14 +377,8 @@ if __name__ == "__main__":
parser.add_argument(
"--part_config", type=str, help="The path to the partition config file"
)
parser.add_argument("--num_clients", type=int, help="The number of clients")
parser.add_argument( parser.add_argument(
"--n_classes", "--n_classes", type=int, default=0, help="the number of classes"
type=int,
default=-1,
help="The number of classes. If not specified, this"
" value will be calculated via scaning all the labels"
" in the dataset which probably causes memory burst.",
) )
parser.add_argument(
"--backend",
...@@ -463,7 +412,8 @@ if __name__ == "__main__":
"--pad-data", "--pad-data",
default=False, default=False,
action="store_true", action="store_true",
help="Pad train nid to the same length across machine, to ensure num of batches to be the same.", help="Pad train nid to the same length across machine, to ensure num "
"of batches to be the same.",
) )
parser.add_argument( parser.add_argument(
"--net_type", "--net_type",
......
import os
os.environ["DGLBACKEND"] = "pytorch"
import argparse
import time
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import dgl
from dgl.distributed import DistEmbedding
from train_dist import DistSAGE, compute_acc
class TransDistSAGE(DistSAGE):
def __init__(
self, in_feats, n_hidden, n_classes, n_layers, activation, dropout
):
super(TransDistSAGE, self).__init__(
in_feats, n_hidden, n_classes, n_layers, activation, dropout
)
def inference(self, standalone, g, x, batch_size, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
x : the input of entire node set.
The inference code is written in a fashion that it could handle any number of nodes and
layers.
"""
# During inference with sampling, multi-layer blocks are very inefficient because
# lots of computations in the first few layers are repeated.
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = dgl.distributed.node_split(
np.arange(g.number_of_nodes()),
g.get_partition_book(),
force_even=True,
)
y = dgl.distributed.DistTensor(
(g.number_of_nodes(), self.n_hidden),
th.float32,
"h",
persistent=True,
)
for l, layer in enumerate(self.layers):
if l == len(self.layers) - 1:
y = dgl.distributed.DistTensor(
(g.number_of_nodes(), self.n_classes),
th.float32,
"h_last",
persistent=True,
)
sampler = NeighborSampler(
g,
[-1],
dgl.distributed.sample_neighbors,
device,
load_feat=False,
)
print(
"|V|={}, eval batch size: {}".format(
g.number_of_nodes(), batch_size
)
)
# Create PyTorch DataLoader for constructing blocks
dataloader = DistDataLoader(
dataset=nodes,
batch_size=batch_size,
collate_fn=sampler.sample_blocks,
shuffle=False,
drop_last=False,
)
for blocks in tqdm.tqdm(dataloader):
block = blocks[0].to(device)
input_nodes = block.srcdata[dgl.NID]
output_nodes = block.dstdata[dgl.NID]
h = x[input_nodes].to(device)
h_dst = h[: block.number_of_dst_nodes()]
h = layer(block, (h, h_dst))
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
y[output_nodes] = h.cpu()
x = y
g.barrier()
return y
def initializer(shape, dtype):
...@@ -114,7 +18,9 @@ def initializer(shape, dtype):
class DistEmb(nn.Module):
def __init__(self, num_nodes, emb_size, dgl_sparse_emb=False, dev_id="cpu"):
super().__init__()
self.dev_id = dev_id
self.emb_size = emb_size
...@@ -138,11 +44,11 @@ class DistEmb(nn.Module):
def load_embs(standalone, emb_layer, g):
nodes = dgl.distributed.node_split(
np.arange(g.num_nodes()), g.get_partition_book(), force_even=True
)
x = dgl.distributed.DistTensor(
(
g.num_nodes(),
emb_layer.module.emb_size
if isinstance(emb_layer, th.nn.parallel.DistributedDataParallel)
else emb_layer.emb_size,
...@@ -154,7 +60,7 @@ def load_embs(standalone, emb_layer, g):
num_nodes = nodes.shape[0]
for i in range((num_nodes + 1023) // 1024):
idx = nodes[
i * 1024: (i + 1) * 1024
if (i + 1) * 1024 < num_nodes
else num_nodes
]
...@@ -187,11 +93,13 @@ def evaluate(
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
if not standalone:
model = model.module
model.eval()
emb_layer.eval()
with th.no_grad():
inputs = load_embs(standalone, emb_layer, g)
pred = model.inference(g, inputs, batch_size, device)
model.train()
emb_layer.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(
...@@ -202,24 +110,17 @@
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, n_classes, g = data
sampler = dgl.dataloading.NeighborSampler(
[int(fanout) for fanout in args.fan_out.split(",")]
)
dataloader = dgl.dataloading.DistNodeDataLoader(
g,
train_nid,
sampler,
batch_size=args.batch_size,
shuffle=True,
drop_last=False,
)
# Define model and optimizer
emb_layer = DistEmb(
g.num_nodes(),
...@@ -227,7 +128,7 @@ def run(args, device, data):
dgl_sparse_emb=args.dgl_sparse,
dev_id=device,
)
model = DistSAGE(
args.num_hidden,
args.num_hidden,
n_classes,
...@@ -263,9 +164,10 @@ def run(args, device, data):
emb_optimizer = th.optim.SparseAdam(
list(emb_layer.module.sparse_emb.parameters()), lr=args.sparse_lr
)
print("optimize Pytorch sparse embedding:", emb_layer.module.sparse_emb)
# Training loop
iter_tput = []
...@@ -280,67 +182,65 @@ def run(args, device, data):
num_seeds = 0
num_inputs = 0
start = time.time()
with model.join():
# Loop over the dataloader to sample the computation dependency
# graph as a list of blocks.
step_time = []
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
tic_step = time.time()
sample_time += tic_step - start
num_seeds += len(blocks[-1].dstdata[dgl.NID])
num_inputs += len(blocks[0].srcdata[dgl.NID])
blocks = [block.to(device) for block in blocks]
batch_labels = g.ndata["labels"][seeds].long().to(device)
# Compute loss and prediction
start = time.time()
batch_inputs = emb_layer(input_nodes)
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
forward_end = time.time()
emb_optimizer.zero_grad()
optimizer.zero_grad()
loss.backward()
compute_end = time.time()
forward_time += forward_end - start
backward_time += compute_end - forward_end
emb_optimizer.step()
optimizer.step()
update_time += time.time() - compute_end
step_t = time.time() - tic_step
step_time.append(step_t)
iter_tput.append(len(blocks[-1].dstdata[dgl.NID]) / step_t)
if step % args.log_every == 0:
acc = compute_acc(batch_pred, batch_labels)
gpu_mem_alloc = (
th.cuda.max_memory_allocated() / 1000000
if th.cuda.is_available()
else 0
)
print(
"Part {} | Epoch {:05d} | Step {:05d} | Loss {:.4f} | "
"Train Acc {:.4f} | Speed (samples/sec) {:.4f} | GPU "
"{:.1f} MB | time {:.3f} s".format(
g.rank(),
epoch,
step,
loss.item(),
acc.item(),
np.mean(iter_tput[3:]),
gpu_mem_alloc,
np.sum(step_time[-args.log_every:]),
)
)
start = time.time()
toc = time.time()
print(
"Part {}, Epoch Time(s): {:.4f}, sample+data_copy: {:.4f}, forward"
": {:.4f}, backward: {:.4f}, update: {:.4f}, #seeds: {}, #inputs"
": {}".format(
g.rank(),
toc - tic,
sample_time,
...@@ -357,7 +257,7 @@ def run(args, device, data):
start = time.time()
val_acc, test_acc = evaluate(
args.standalone,
model,
emb_layer,
g,
g.ndata["labels"],
...@@ -367,7 +267,8 @@ def run(args, device, data):
device,
)
print(
"Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}".format(
g.rank(), val_acc, test_acc, time.time() - start
)
)
...@@ -377,7 +278,10 @@ def main(args):
dgl.distributed.initialize(args.ip_config)
if not args.standalone:
th.distributed.init_process_group(backend="gloo")
g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
print("rank:", g.rank())
pb = g.get_partition_book()
...@@ -392,7 +296,8 @@ def main(args):
)
local_nid = pb.partid2nids(pb.partid).detach().numpy()
print(
"part {}, train: {} (local: {}), val: {} (local: {}), test: {} "
"(local: {})".format(
g.rank(),
len(train_nid),
len(np.intersect1d(train_nid.numpy(), local_nid)),
...@@ -405,8 +310,9 @@ def main(args):
if args.num_gpus == -1:
device = th.device("cpu")
else:
dev_id = g.rank() % args.num_gpus
device = th.device("cuda:" + str(dev_id))
labels = g.ndata["labels"][np.arange(g.num_nodes())]
n_classes = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
print("#labels:", n_classes)
...@@ -418,7 +324,6 @@ if __name__ == "__main__":
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GCN")
parser.add_argument("--graph_name", type=str, help="graph name")
parser.add_argument("--id", type=int, help="the partition id")
parser.add_argument(
...@@ -427,7 +332,6 @@ if __name__ == "__main__":
parser.add_argument(
"--part_config", type=str, help="The path to the partition config file"
)
parser.add_argument("--n_classes", type=int, help="the number of classes")
parser.add_argument(
"--num_gpus",
...
import os
os.environ["DGLBACKEND"] = "pytorch"
import argparse
import time
import numpy as np
import torch as th
import torch.nn.functional as F
import torch.optim as optim
import dgl
from train_dist_transductive import DistEmb, load_embs
from train_dist_unsupervised import CrossEntropyLoss, DistSAGE, compute_acc
def generate_emb(standalone, model, emb_layer, g, batch_size, device):
...@@ -43,6 +19,8 @@ def generate_emb(standalone, model, emb_layer, g, batch_size, device):
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
if not standalone:
model = model.module
model.eval()
emb_layer.eval()
with th.no_grad():
...@@ -64,24 +42,24 @@ def run(args, device, data):
labels,
) = data
# Create sampler
neg_sampler = dgl.dataloading.negative_sampler.Uniform(args.num_negs)
sampler = dgl.dataloading.NeighborSampler(
[int(fanout) for fanout in args.fan_out.split(",")]
)
# Create dataloader
exclude = "reverse_id" if args.remove_edge else None
reverse_eids = th.arange(g.num_edges()) if args.remove_edge else None
dataloader = dgl.dataloading.DistEdgeDataLoader(
g,
train_eids,
sampler,
negative_sampler=neg_sampler,
exclude=exclude,
reverse_eids=reverse_eids,
batch_size=args.batch_size,
shuffle=True,
drop_last=False,
)
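# With a negative_sampler attached, DistEdgeDataLoader yields
# (input_nodes, pos_graph, neg_graph, blocks) per iteration.
# exclude="reverse_id" together with reverse_eids removes the reverse of each
# seed edge from the sampled neighborhood to avoid information leakage.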
# Define model and optimizer
emb_layer = DistEmb(
g.num_nodes(),
...@@ -89,7 +67,7 @@ def run(args, device, data):
dgl_sparse_emb=args.dgl_sparse,
dev_id=device,
)
model = DistSAGE(
args.num_hidden,
args.num_hidden,
args.num_hidden,
...@@ -126,21 +104,16 @@ def run(args, device, data):
emb_optimizer = th.optim.SparseAdam(
list(emb_layer.module.sparse_emb.parameters()), lr=args.sparse_lr
)
print("optimize Pytorch sparse embedding:", emb_layer.module.sparse_emb)
# Training loop
epoch = 0
for epoch in range(args.num_epochs):
num_seeds = 0
num_inputs = 0
step_time = []
sample_t = []
feat_copy_t = []
forward_t = []
...@@ -149,69 +122,71 @@ def run(args, device, data):
iter_tput = []
start = time.time()
with model.join():
# Loop over the dataloader to sample the computation dependency
# graph as a list of blocks.
for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(
dataloader
):
tic_step = time.time()
sample_t.append(tic_step - start)
copy_t = time.time()
pos_graph = pos_graph.to(device)
neg_graph = neg_graph.to(device)
blocks = [block.to(device) for block in blocks]
feat_copy_t.append(copy_t - tic_step)
copy_time = time.time()
# Compute loss and prediction
batch_inputs = emb_layer(input_nodes)
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, pos_graph, neg_graph)
forward_end = time.time()
emb_optimizer.zero_grad()
optimizer.zero_grad()
loss.backward()
compute_end = time.time()
forward_t.append(forward_end - copy_time)
backward_t.append(compute_end - forward_end)
# Aggregate gradients in multiple nodes.
emb_optimizer.step()
optimizer.step()
update_t.append(time.time() - compute_end)
pos_edges = pos_graph.num_edges()
step_t = time.time() - start
step_time.append(step_t)
iter_tput.append(pos_edges / step_t)
num_seeds += pos_edges
if step % args.log_every == 0:
print(
"[{}] Epoch {:05d} | Step {:05d} | Loss {:.4f} | Speed "
"(samples/sec) {:.4f} | time {:.3f}s | sample {:.3f} | "
"copy {:.3f} | forward {:.3f} | backward {:.3f} | "
"update {:.3f}".format(
g.rank(),
epoch,
step,
loss.item(),
np.mean(iter_tput[3:]),
np.sum(step_time[-args.log_every:]),
np.sum(sample_t[-args.log_every:]),
np.sum(feat_copy_t[-args.log_every:]),
np.sum(forward_t[-args.log_every:]),
np.sum(backward_t[-args.log_every:]),
np.sum(update_t[-args.log_every:]),
)
)
start = time.time()
print(
"[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, "
"forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #seeds: {}, "
"#inputs: {}".format(
g.rank(),
np.sum(step_time),
np.sum(sample_t),
...@@ -226,14 +201,9 @@ def run(args, device, data):
epoch += 1
# evaluate the embedding using LogisticRegression
pred = generate_emb(
args.standalone, model, emb_layer, g, args.batch_size_eval, device
)
if g.rank() == 0:
eval_acc, test_acc = compute_acc(
pred, labels, global_train_nid, global_valid_nid, global_test_nid
...@@ -251,7 +221,6 @@ def run(args, device, data):
if g.rank() == 0:
th.save(pred, "emb.pt")
else:
th.save(pred, "emb.pt")
...@@ -259,32 +228,35 @@ def main(args):
dgl.distributed.initialize(args.ip_config)
if not args.standalone:
th.distributed.init_process_group(backend="gloo")
g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
print("rank:", g.rank())
print("number of edges", g.num_edges())
train_eids = dgl.distributed.edge_split(
th.ones((g.num_edges(),), dtype=th.bool),
g.get_partition_book(),
force_even=True,
)
train_nids = dgl.distributed.node_split(
th.ones((g.num_nodes(),), dtype=th.bool), g.get_partition_book()
)
global_train_nid = th.LongTensor(
np.nonzero(g.ndata["train_mask"][np.arange(g.num_nodes())])
)
global_valid_nid = th.LongTensor(
np.nonzero(g.ndata["val_mask"][np.arange(g.num_nodes())])
)
global_test_nid = th.LongTensor(
np.nonzero(g.ndata["test_mask"][np.arange(g.num_nodes())])
)
labels = g.ndata["labels"][np.arange(g.num_nodes())]
if args.num_gpus == -1:
device = th.device("cpu")
else:
dev_id = g.rank() % args.num_gpus
device = th.device("cuda:" + str(dev_id))
# Pack data
global_train_nid = global_train_nid.squeeze()
...@@ -308,7 +280,6 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GCN")
parser.add_argument("--graph_name", type=str, help="graph name")
parser.add_argument("--id", type=int, help="the partition id")
parser.add_argument(
...
## DistGNN vertex-cut based graph partitioning (using Libra)
### How to run graph partitioning
```python partition_graph.py --dataset <dataset> --num-parts <num_parts> --out-dir <output_location>```
Example: The following command line creates 4 partitions of the pubmed graph
``` python partition_graph.py --dataset pubmed --num-parts 4 --out-dir ./```
The output partitions are created in the Libra_result_\<dataset\>/ folder under the current directory.
The *upcoming DistGNN* application can directly use these partitions for distributed training.
### How Libra partitioning works
Libra is a vertex-cut based graph partitioning method. It applies a greedy heuristic to uniquely distribute the input graph edges among the partitions, and it generates each partition as a list of edges. The ```libra_partition.py``` script generates the Libra partitions and then converts the Libra output to the DGL/DistGNN input format.
Note: The current Libra implementation is sequential, and extra overhead is paid for the additional format conversion of the partitioned graph.
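As a rough illustration of the greedy vertex-cut idea, each edge can be assigned to the least-loaded partition that already holds a copy of one of its endpoints, so vertices (not edges) end up replicated across partitions. The sketch below is a simplified toy version, not the actual Libra implementation:
```python
from collections import defaultdict

def greedy_vertex_cut(edges, num_parts):
    """Toy greedy vertex-cut: assign every edge to exactly one partition."""
    part_edges = defaultdict(list)   # partition id -> edges assigned to it
    vertex_parts = defaultdict(set)  # vertex -> partitions holding a copy
    for u, v in edges:
        candidates = vertex_parts[u] | vertex_parts[v]
        if not candidates:
            candidates = set(range(num_parts))
        p = min(candidates, key=lambda i: len(part_edges[i]))
        part_edges[p].append((u, v))
        vertex_parts[u].add(p)
        vertex_parts[v].add(p)
    return part_edges
```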
### Expected partitioning timings
Cora, Pubmed, Citeseer: < 10 sec (<10GB)
Reddit: ~150 sec (~ 25GB)
OGBN-Products: ~200 sec (~30GB)
Proteins: 1800 sec (Format conversion from public data takes time) (~100GB)
OGBN-Paper100M: 2500 sec (~200GB)
### Settings
Tested with:
CentOS 7.6
gcc v8.3.0
PyTorch 1.7.1
Python 3.7.10
r"""
Copyright (c) 2021 Intel Corporation
\file Graph partitioning
\brief Calls Libra - Vertex-cut based graph partitioner for distributed training
\author Vasimuddin Md <vasimuddin.md@intel.com>,
Guixiang Ma <guixiang.ma@intel.com>
Sanchit Misra <sanchit.misra@intel.com>,
Ramanarayan Mohanty <ramanarayan.mohanty@intel.com>,
Sasikanth Avancha <sasikanth.avancha@intel.com>
Nesreen K. Ahmed <nesreen.k.ahmed@intel.com>
"""
import argparse
import csv
import os
import random
import sys
import time
from statistics import mean
import numpy as np
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from load_graph import load_ogb
import dgl
from dgl.base import DGLError
from dgl.data import load_data
from dgl.distgnn.partition import partition_graph
from dgl.distgnn.tools import load_proteins
if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--dataset", type=str, default="cora")
argparser.add_argument("--num-parts", type=int, default=2)
argparser.add_argument("--out-dir", type=str, default="./")
args = argparser.parse_args()
dataset = args.dataset
num_community = args.num_parts
out_dir = "Libra_result_" + dataset ## "Libra_result_" prefix is mandatory
resultdir = os.path.join(args.out_dir, out_dir)
print("Input dataset for partitioning: ", dataset)
if args.dataset == "ogbn-products":
print("Loading ogbn-products")
G, _ = load_ogb("ogbn-products")
elif args.dataset == "ogbn-papers100M":
print("Loading ogbn-papers100M")
G, _ = load_ogb("ogbn-papers100M")
elif args.dataset == "proteins":
G = load_proteins("proteins")
elif args.dataset == "ogbn-arxiv":
print("Loading ogbn-arxiv")
G, _ = load_ogb("ogbn-arxiv")
else:
try:
G = load_data(args)[0]
except:
raise DGLError("Error: Dataset {} not found !!!".format(dataset))
print("Done loading the graph.", flush=True)
partition_graph(num_community, G, resultdir)