Commit 927d2b31 authored by Mufei Li, committed by GitHub

[Example] Move Data to GPU before Minibatch Training (#2453)

parent 72ef642f
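Every file touched below applies the same pattern: node features and labels are popped out of g.ndata into standalone tensors, optionally moved to the GPU once up front, and then sliced per minibatch inside the training loop. A minimal sketch of that pattern, assuming feature/label keys named 'features'/'labels' as in the Reddit examples (the helper names here are illustrative, not lifted from any single file):

import torch as th

def prepare_node_data(g, device, data_cpu=False):
    # Pop features/labels off the graph so they are not carried along
    # with the graph structure (e.g. into sampling worker processes).
    nfeat = g.ndata.pop('features')
    labels = g.ndata.pop('labels')
    if not data_cpu:
        # One bulk host-to-device copy up front, instead of a copy per
        # minibatch; assumes the tensors fit in GPU memory at once.
        nfeat = nfeat.to(device)
        labels = labels.to(device)
    return nfeat, labels

def load_subtensor(nfeat, labels, seeds, input_nodes, device):
    # If nfeat/labels already live on `device`, the indexing is a
    # device-local gather and .to(device) is a no-op; otherwise this
    # degrades gracefully to a CPU slice plus a small copy.
    return nfeat[input_nodes].to(device), labels[seeds].to(device)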
@@ -4,19 +4,12 @@ import torch as th
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
-from torch.utils.data import DataLoader
-import dgl.function as fn
 import dgl.nn.pytorch as dglnn
 import time
 import argparse
-from _thread import start_new_thread
-from functools import wraps
-from dgl.data import RedditDataset
 import tqdm
-import traceback
-from load_graph import load_reddit, load_ogb, inductive_split
+from load_graph import load_reddit, inductive_split

 class SAGE(nn.Module):
     def __init__(self,
@@ -47,7 +40,7 @@ class SAGE(nn.Module):
             h = self.dropout(h)
         return h

-    def inference(self, g, x, batch_size, device):
+    def inference(self, g, x, device):
         """
         Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
         g : the entire graph.
@@ -62,12 +55,12 @@ class SAGE(nn.Module):
         # on each layer are of course splitted in batches.
         # TODO: can we standardize this?
         for l, layer in enumerate(self.layers):
-            y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
+            y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)

             sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
             dataloader = dgl.dataloading.NodeDataLoader(
                 g,
-                th.arange(g.number_of_nodes()),
+                th.arange(g.num_nodes()),
                 sampler,
                 batch_size=args.batch_size,
                 shuffle=True,
@@ -96,34 +89,35 @@ def compute_acc(pred, labels):
     labels = labels.long()
     return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

-def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
+def evaluate(model, g, nfeat, labels, val_nid, device):
    """
    Evaluate the model on the validation set specified by ``val_nid``.
    g : The entire graph.
    inputs : The features of all the nodes.
    labels : The labels of all the nodes.
    val_nid : the node Ids for validation.
-    batch_size : Number of nodes to compute at the same time.
    device : The GPU device to evaluate on.
    """
    model.eval()
    with th.no_grad():
-        pred = model.inference(g, inputs, batch_size, device)
+        pred = model.inference(g, nfeat, device)
    model.train()
-    return compute_acc(pred[val_nid], labels[val_nid])
+    return compute_acc(pred[val_nid], labels[val_nid].to(pred.device))

-def load_subtensor(g, seeds, input_nodes, device):
+def load_subtensor(nfeat, labels, seeds, input_nodes, device):
    """
-    Copys features and labels of a set of nodes onto GPU.
+    Extracts features and labels for a subset of nodes
    """
-    batch_inputs = g.ndata['features'][input_nodes].to(device)
-    batch_labels = g.ndata['labels'][seeds].to(device)
+    batch_inputs = nfeat[input_nodes].to(device)
+    batch_labels = labels[seeds].to(device)
    return batch_inputs, batch_labels

 #### Entry point
 def run(args, device, data):
    # Unpack data
-    in_feats, n_classes, train_g, val_g, test_g = data
+    n_classes, train_g, val_g, test_g, train_nfeat, train_labels, \
+    val_nfeat, val_labels, test_nfeat, test_labels = data
+    in_feats = train_nfeat.shape[1]
    train_nid = th.nonzero(train_g.ndata['train_mask'], as_tuple=True)[0]
    val_nid = th.nonzero(val_g.ndata['val_mask'], as_tuple=True)[0]
    test_nid = th.nonzero(~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
@@ -144,7 +138,6 @@ def run(args, device, data):
    model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
    model = model.to(device)
    loss_fcn = nn.CrossEntropyLoss()
-    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training loop
@@ -158,10 +151,9 @@ def run(args, device, data):
        tic_step = time.time()
        for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
            # Load the input features as well as output labels
-            #batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)
+            batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
+                                                        seeds, input_nodes, device)
            blocks = [block.int().to(device) for block in blocks]
-            batch_inputs = blocks[0].srcdata['features']
-            batch_labels = blocks[-1].dstdata['labels']

            # Compute loss and prediction
            batch_pred = model(blocks, batch_inputs)
@@ -183,9 +175,9 @@ def run(args, device, data):
        if epoch >= 5:
            avg += toc - tic
        if epoch % args.eval_every == 0 and epoch != 0:
-            eval_acc = evaluate(model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, device)
+            eval_acc = evaluate(model, val_g, val_nfeat, val_labels, val_nid, device)
            print('Eval Acc {:.4f}'.format(eval_acc))
-            test_acc = evaluate(model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, device)
+            test_acc = evaluate(model, test_g, test_nfeat, test_labels, test_nid, device)
            print('Test Acc: {:.4f}'.format(test_acc))

    print('Avg epoch time: {}'.format(avg / (epoch - 4)))
@@ -208,6 +200,11 @@ if __name__ == '__main__':
                           help="Number of sampling processes. Use 0 for no extra process.")
    argparser.add_argument('--inductive', action='store_true',
                           help="Inductive learning setting")
+    argparser.add_argument('--data-cpu', action='store_true',
+                           help="By default the script puts all node features and labels "
+                                "on GPU when using it to save time for data copy. This may "
+                                "be undesired if they cannot fit in GPU memory at once. "
+                                "This flag disables that.")
    args = argparser.parse_args()

    if args.gpu >= 0:
@@ -217,17 +214,25 @@ if __name__ == '__main__':
    if args.dataset == 'reddit':
        g, n_classes = load_reddit()
-    elif args.dataset == 'ogb-product':
-        g, n_classes = load_ogb('ogbn-products')
    else:
        raise Exception('unknown dataset')
-    in_feats = g.ndata['features'].shape[1]

    if args.inductive:
        train_g, val_g, test_g = inductive_split(g)
+        train_nfeat = train_g.ndata.pop('features')
+        val_nfeat = val_g.ndata.pop('features')
+        test_nfeat = test_g.ndata.pop('features')
+        train_labels = train_g.ndata.pop('labels')
+        val_labels = val_g.ndata.pop('labels')
+        test_labels = test_g.ndata.pop('labels')
    else:
        train_g = val_g = test_g = g
+        train_nfeat = val_nfeat = test_nfeat = g.ndata.pop('features')
+        train_labels = val_labels = test_labels = g.ndata.pop('labels')
+
+    if not args.data_cpu:
+        train_nfeat = train_nfeat.to(device)
+        train_labels = train_labels.to(device)

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves momory and CPU.
@@ -235,6 +240,7 @@ if __name__ == '__main__':
    val_g.create_formats_()
    test_g.create_formats_()
    # Pack data
-    data = in_feats, n_classes, train_g, val_g, test_g
+    data = n_classes, train_g, val_g, test_g, train_nfeat, train_labels, \
+           val_nfeat, val_labels, test_nfeat, test_labels

    run(args, device, data)
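One subtle consequence shows up in evaluate() above: inference() assembles its output y on the CPU, while the labels may now live on the GPU, so the comparison moves the label slice onto pred's device. In isolation (toy tensors, not the example's data):

import torch as th

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
pred = th.randn(10, 3)                        # inference output, assembled on CPU
labels = th.randint(0, 3, (10,)).to(device)   # labels resident on GPU (default path)
val_nid = th.arange(5)

# Comparing tensors on different devices raises an error, hence the
# explicit .to(pred.device) in the diff above.
acc = (th.argmax(pred[val_nid], dim=1) == labels[val_nid].to(pred.device)).float().mean()
print(acc)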
@@ -5,16 +5,12 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.multiprocessing as mp
-from torch.utils.data import DataLoader
-import dgl.function as fn
 import dgl.nn.pytorch as dglnn
 import time
 import math
 import argparse
-from dgl.data import RedditDataset
 from torch.nn.parallel import DistributedDataParallel
 import tqdm
-import traceback
 from utils import thread_wrapped_func
 from load_graph import load_reddit, inductive_split
@@ -48,7 +44,7 @@ class SAGE(nn.Module):
             h = self.dropout(h)
         return h

-    def inference(self, g, x, batch_size, device):
+    def inference(self, g, x, device):
         """
         Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
         g : the entire graph.
@@ -62,14 +58,13 @@ class SAGE(nn.Module):
         # Therefore, we compute the representation of all nodes layer by layer. The nodes
         # on each layer are of course splitted in batches.
         # TODO: can we standardize this?
-        nodes = th.arange(g.number_of_nodes())
         for l, layer in enumerate(self.layers):
-            y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
+            y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)

             sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
             dataloader = dgl.dataloading.NodeDataLoader(
                 g,
-                th.arange(g.number_of_nodes()),
+                th.arange(g.num_nodes()),
                 sampler,
                 batch_size=args.batch_size,
                 shuffle=True,
@@ -97,27 +92,26 @@ def compute_acc(pred, labels):
    """
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

-def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
+def evaluate(model, g, nfeat, labels, val_nid, device):
    """
    Evaluate the model on the validation set specified by ``val_nid``.
    g : The entire graph.
    inputs : The features of all the nodes.
    labels : The labels of all the nodes.
    val_nid : A node ID tensor indicating which nodes do we actually compute the accuracy for.
-    batch_size : Number of nodes to compute at the same time.
    device : The GPU device to evaluate on.
    """
    model.eval()
    with th.no_grad():
-        pred = model.inference(g, inputs, batch_size, device)
+        pred = model.inference(g, nfeat, device)
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid])

-def load_subtensor(g, labels, seeds, input_nodes, dev_id):
+def load_subtensor(nfeat, labels, seeds, input_nodes, dev_id):
    """
-    Copys features and labels of a set of nodes onto GPU.
+    Extracts features and labels for a subset of nodes.
    """
-    batch_inputs = g.ndata['features'][input_nodes].to(dev_id)
+    batch_inputs = nfeat[input_nodes].to(dev_id)
    batch_labels = labels[seeds].to(dev_id)
    return batch_inputs, batch_labels
@@ -137,7 +131,25 @@ def run(proc_id, n_gpus, args, devices, data):
        th.cuda.set_device(dev_id)

    # Unpack data
-    in_feats, n_classes, train_g, val_g, test_g = data
+    n_classes, train_g, val_g, test_g = data
+
+    if args.inductive:
+        train_nfeat = train_g.ndata.pop('features')
+        val_nfeat = val_g.ndata.pop('features')
+        test_nfeat = test_g.ndata.pop('features')
+        train_labels = train_g.ndata.pop('labels')
+        val_labels = val_g.ndata.pop('labels')
+        test_labels = test_g.ndata.pop('labels')
+    else:
+        train_nfeat = val_nfeat = test_nfeat = g.ndata.pop('features')
+        train_labels = val_labels = test_labels = g.ndata.pop('labels')
+
+    if not args.data_cpu:
+        train_nfeat = train_nfeat.to(dev_id)
+        train_labels = train_labels.to(dev_id)
+
+    in_feats = train_nfeat.shape[1]
    train_mask = train_g.ndata['train_mask']
    val_mask = val_g.ndata['val_mask']
    test_mask = ~(test_g.ndata['train_mask'] | test_g.ndata['val_mask'])
@@ -166,7 +178,6 @@ def run(proc_id, n_gpus, args, devices, data):
    if n_gpus > 1:
        model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
    loss_fcn = nn.CrossEntropyLoss()
-    loss_fcn = loss_fcn.to(dev_id)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training loop
@@ -182,7 +193,8 @@ def run(proc_id, n_gpus, args, devices, data):
            tic_step = time.time()

            # Load the input features as well as output labels
-            batch_inputs, batch_labels = load_subtensor(train_g, train_g.ndata['labels'], seeds, input_nodes, dev_id)
+            batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
+                                                        seeds, input_nodes, dev_id)
            blocks = [block.int().to(dev_id) for block in blocks]

            # Compute loss and prediction
            batch_pred = model(blocks, batch_inputs)
@@ -209,14 +221,14 @@ def run(proc_id, n_gpus, args, devices, data):
        if epoch % args.eval_every == 0 and epoch != 0:
            if n_gpus == 1:
                eval_acc = evaluate(
-                    model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
+                    model, val_g, val_nfeat, val_labels, val_nid, devices[0])
                test_acc = evaluate(
-                    model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
+                    model, test_g, test_nfeat, test_labels, test_nid, devices[0])
            else:
                eval_acc = evaluate(
-                    model.module, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
+                    model.module, val_g, val_nfeat, val_labels, val_nid, devices[0])
                test_acc = evaluate(
-                    model.module, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
+                    model.module, test_g, test_nfeat, test_labels, test_nid, devices[0])
            print('Eval Acc {:.4f}'.format(eval_acc))
            print('Test Acc: {:.4f}'.format(test_acc))
@@ -243,6 +255,11 @@ if __name__ == '__main__':
                           help="Number of sampling processes. Use 0 for no extra process.")
    argparser.add_argument('--inductive', action='store_true',
                           help="Inductive learning setting")
+    argparser.add_argument('--data-cpu', action='store_true',
+                           help="By default the script puts all node features and labels "
+                                "on GPU when using it to save time for data copy. This may "
+                                "be undesired if they cannot fit in GPU memory at once. "
+                                "This flag disables that.")
    args = argparser.parse_args()

    devices = list(map(int, args.gpu.split(',')))
@@ -251,7 +268,6 @@ if __name__ == '__main__':
    g, n_classes = load_reddit()
    # Construct graph
    g = dgl.as_heterograph(g)
-    in_feats = g.ndata['features'].shape[1]

    if args.inductive:
        train_g, val_g, test_g = inductive_split(g)
@@ -264,7 +280,7 @@ if __name__ == '__main__':
    val_g.create_formats_()
    test_g.create_formats_()
    # Pack data
-    data = in_feats, n_classes, train_g, val_g, test_g
+    data = n_classes, train_g, val_g, test_g

    if n_gpus == 1:
        run(0, n_gpus, args, devices, data)
......
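In the multi-GPU file above, the feature pop and the .to(dev_id) copy happen inside run() rather than in __main__: each spawned worker owns one GPU, so each process must make its own device copy after pinning itself with th.cuda.set_device. A toy sketch of that ordering (hypothetical worker function, not the example's code):

import torch as th
import torch.multiprocessing as mp

def worker(proc_id, devices, nfeat_cpu):
    dev_id = devices[proc_id]
    th.cuda.set_device(dev_id)
    # Per-process host-to-device copy: each worker materializes the
    # features on its own GPU only after the process has been spawned.
    nfeat = nfeat_cpu.to(dev_id)
    print('proc', proc_id, 'features on', nfeat.device)

if __name__ == '__main__':
    devices = [0, 1]                  # two visible GPUs assumed
    nfeat_cpu = th.randn(1000, 16)    # toy feature matrix
    mp.spawn(worker, args=(devices, nfeat_cpu), nprocs=len(devices))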
@@ -5,17 +5,13 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.multiprocessing as mp
-from torch.utils.data import DataLoader
 import dgl.function as fn
 import dgl.nn.pytorch as dglnn
 import time
 import argparse
-from _thread import start_new_thread
-from functools import wraps
 from dgl.data import RedditDataset
 from torch.nn.parallel import DistributedDataParallel
 import tqdm
-import traceback
 import sklearn.linear_model as lm
 import sklearn.metrics as skm
@@ -38,13 +34,6 @@ class NegativeSampler(object):
         src = src.repeat_interleave(self.k)
         return src, dst

-def load_subtensor(g, input_nodes, device):
-    """
-    Copys features and labels of a set of nodes onto GPU.
-    """
-    batch_inputs = g.ndata['features'][input_nodes].to(device)
-    return batch_inputs

 class SAGE(nn.Module):
     def __init__(self,
                  in_feats,
@@ -74,7 +63,7 @@ class SAGE(nn.Module):
             h = self.dropout(h)
         return h

-    def inference(self, g, x, batch_size, device):
+    def inference(self, g, x, device):
         """
         Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
         g : the entire graph.
@@ -88,14 +77,13 @@ class SAGE(nn.Module):
         # Therefore, we compute the representation of all nodes layer by layer. The nodes
         # on each layer are of course splitted in batches.
         # TODO: can we standardize this?
-        nodes = th.arange(g.number_of_nodes())
         for l, layer in enumerate(self.layers):
-            y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
+            y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)

             sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
             dataloader = dgl.dataloading.NodeDataLoader(
                 g,
-                th.arange(g.number_of_nodes()),
+                th.arange(g.num_nodes()),
                 sampler,
                 batch_size=args.batch_size,
                 shuffle=True,
@@ -155,24 +143,23 @@ def compute_acc(emb, labels, train_nids, val_nids, test_nids):
    f1_micro_test = skm.f1_score(test_labels, pred[test_nids], average='micro')
    return f1_micro_eval, f1_micro_test

-def evaluate(model, g, inputs, labels, train_nids, val_nids, test_nids, batch_size, device):
+def evaluate(model, g, nfeat, labels, train_nids, val_nids, test_nids, device):
    """
    Evaluate the model on the validation set specified by ``val_mask``.
    g : The entire graph.
    inputs : The features of all the nodes.
    labels : The labels of all the nodes.
    val_mask : A 0-1 mask indicating which nodes do we actually compute the accuracy for.
-    batch_size : Number of nodes to compute at the same time.
    device : The GPU device to evaluate on.
    """
    model.eval()
    with th.no_grad():
        # single gpu
        if isinstance(model, SAGE):
-            pred = model.inference(g, inputs, batch_size, device)
+            pred = model.inference(g, nfeat, device)
        # multi gpu
        else:
-            pred = model.module.inference(g, inputs, batch_size, device)
+            pred = model.module.inference(g, nfeat, device)
    model.train()
    return compute_acc(pred, labels, train_nids, val_nids, test_nids)
@@ -188,18 +175,17 @@ def run(proc_id, n_gpus, args, devices, data):
            init_method=dist_init_method,
            world_size=world_size,
            rank=proc_id)
-    train_mask, val_mask, test_mask, in_feats, labels, n_classes, g = data
+    train_mask, val_mask, test_mask, n_classes, g = data
+    nfeat = g.ndata.pop('feat')
+    labels = g.ndata.pop('label')
+    in_feats = nfeat.shape[1]

    train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
    val_nid = th.LongTensor(np.nonzero(val_mask)).squeeze()
    test_nid = th.LongTensor(np.nonzero(test_mask)).squeeze()
-    #train_nid = th.LongTensor(np.nonzero(train_mask)[0])
-    #val_nid = th.LongTensor(np.nonzero(val_mask)[0])
-    #test_nid = th.LongTensor(np.nonzero(test_mask)[0])

    # Create PyTorch DataLoader for constructing blocks
-    n_edges = g.number_of_edges()
+    n_edges = g.num_edges()
    train_seeds = np.arange(n_edges)
    if n_gpus > 0:
        num_per_gpu = (train_seeds.shape[0] + n_gpus -1) // n_gpus
@@ -230,7 +216,6 @@ def run(proc_id, n_gpus, args, devices, data):
    if n_gpus > 1:
        model = DistributedDataParallel(model, device_ids=[device], output_device=device)
    loss_fcn = CrossEntropyLoss()
-    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training loop
@@ -249,7 +234,7 @@ def run(proc_id, n_gpus, args, devices, data):
        tic_step = time.time()
        for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader):
-            batch_inputs = load_subtensor(g, input_nodes, device)
+            batch_inputs = nfeat[input_nodes].to(device)
            d_step = time.time()

            pos_graph = pos_graph.to(device)
@@ -263,8 +248,8 @@ def run(proc_id, n_gpus, args, devices, data):
            optimizer.step()

            t = time.time()
-            pos_edges = pos_graph.number_of_edges()
-            neg_edges = neg_graph.number_of_edges()
+            pos_edges = pos_graph.num_edges()
+            neg_edges = neg_graph.num_edges()
            iter_pos.append(pos_edges / (t - tic_step))
            iter_neg.append(neg_edges / (t - tic_step))
            iter_d.append(d_step - tic_step)
@@ -276,14 +261,21 @@ def run(proc_id, n_gpus, args, devices, data):
            tic_step = time.time()

            if step % args.eval_every == 0 and proc_id == 0:
-                eval_acc, test_acc = evaluate(model, g, g.ndata['features'], labels, train_nid, val_nid, test_nid, args.batch_size, device)
+                eval_acc, test_acc = evaluate(model, g, nfeat, labels, train_nid, val_nid, test_nid, device)
                print('Eval Acc {:.4f} Test Acc {:.4f}'.format(eval_acc, test_acc))
                if eval_acc > best_eval_acc:
                    best_eval_acc = eval_acc
                    best_test_acc = test_acc
                print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format(best_eval_acc, best_test_acc))
+        toc = time.time()
+        if proc_id == 0:
+            print('Epoch Time(s): {:.4f}'.format(toc - tic))
+            if epoch >= 5:
+                avg += toc - tic

        if n_gpus > 1:
            th.distributed.barrier()
-    print('Avg epoch time: {}'.format(avg / (epoch - 4)))
+    if proc_id == 0:
+        print('Avg epoch time: {}'.format(avg / (epoch - 4)))

 def main(args, devices):
@@ -291,19 +283,15 @@ def main(args, devices):
    data = RedditDataset(self_loop=False)
    n_classes = data.num_classes
    g = data[0]
-    features = g.ndata['feat']
-    in_feats = features.shape[1]
-    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
-    g.ndata['features'] = features

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves momory and CPU.
    g.create_formats_()
    # Pack data
-    data = train_mask, val_mask, test_mask, in_feats, labels, n_classes, g
+    data = train_mask, val_mask, test_mask, n_classes, g

    n_gpus = len(devices)
    if devices[0] == -1:
@@ -324,7 +312,8 @@ def main(args, devices):
 if __name__ == '__main__':
    argparser = argparse.ArgumentParser("multi-gpu training")
    argparser.add_argument("--gpu", type=str, default='0',
-                           help="GPU, can be a list of gpus for multi-gpu trianing, e.g., 0,1,2,3; -1 for CPU")
+                           help="GPU, can be a list of gpus for multi-gpu trianing,"
+                                " e.g., 0,1,2,3; -1 for CPU")
    argparser.add_argument('--num-epochs', type=int, default=20)
    argparser.add_argument('--num-hidden', type=int, default=16)
    argparser.add_argument('--num-layers', type=int, default=2)
......
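The unsupervised file above trains on edges with a NegativeSampler whose body is mostly collapsed in this view; only src.repeat_interleave(self.k) survives. For orientation, a uniform variant of such a sampler looks roughly like this (a sketch only; the example's own class may weight destination nodes differently):

import torch as th

class UniformNegativeSampler(object):
    """For each positive edge, pair its source with k random nodes."""
    def __init__(self, g, k):
        self.k = k
        self.num_nodes = g.num_nodes()

    def __call__(self, g, eids):
        src, _ = g.find_edges(eids)                   # endpoints of the positive edges
        src = src.repeat_interleave(self.k)           # k copies per positive edge
        dst = th.randint(0, self.num_nodes, (len(src),))  # uniform fake destinations
        return src, dst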
@@ -5,24 +5,15 @@ import torch as th
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
-import dgl.function as fn
 import dgl.nn.pytorch as dglnn
 import time
 import argparse
-from _thread import start_new_thread
-from functools import wraps
-from dgl.data import RedditDataset
 import tqdm
-import traceback
 from ogb.nodeproppred import DglNodePropPredDataset
 from sampler import ClusterIter, subgraph_collate_fn

-#### Neighbor sampler

 class GAT(nn.Module):
     def __init__(self,
                  in_feats,
@@ -79,16 +70,15 @@ class GAT(nn.Module):
         layers.
         """
         num_heads = self.num_heads
-        nodes = th.arange(g.number_of_nodes())
         for l, layer in enumerate(self.layers):
             if l < self.n_layers - 1:
-                y = th.zeros(g.number_of_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
+                y = th.zeros(g.num_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
             else:
-                y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
+                y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
             sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
             dataloader = dgl.dataloading.NodeDataLoader(
                 g,
-                th.arange(g.number_of_nodes()),
+                th.arange(g.num_nodes()),
                 sampler,
                 batch_size=batch_size,
                 shuffle=False,
@@ -98,7 +88,6 @@ class GAT(nn.Module):
             for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                 block = blocks[0].int().to(device)
                 h = x[input_nodes].to(device)
-                h_dst = h[:block.number_of_dst_nodes()].to(device)
                 if l < self.n_layers - 1:
                     h = layer(block, h).flatten(1)
                 else:
@@ -116,7 +105,7 @@ def compute_acc(pred, labels):
    """
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

-def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
+def evaluate(model, g, nfeat, labels, val_nid, test_nid, batch_size, device):
    """
    Evaluate the model on the validation set specified by ``val_mask``.
    g : The entire graph.
@@ -128,8 +117,7 @@ def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
    """
    model.eval()
    with th.no_grad():
-        inputs = g.ndata['feat']
-        pred = model.inference(g, inputs, batch_size, device)
+        pred = model.inference(g, nfeat, batch_size, device)
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
@@ -142,7 +130,8 @@ def model_param_summary(model):

 def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, in_feats, labels, n_classes, g, cluster_iterator = data
+    labels = labels.to(device)
+    nfeat = g.ndata.pop('feat').to(device)

    # Define model and optimizer
    model = GAT(in_feats, args.num_heads, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
@@ -164,16 +153,18 @@ def run(args, device, data):
        # blocks.
        tic_start = time.time()
        for step, cluster in enumerate(cluster_iterator):
-            cluster = cluster.int().to(device)
-            mask = cluster.ndata['train_mask']
+            mask = cluster.ndata.pop('train_mask')
            if mask.sum() == 0:
                continue
-            feat = cluster.ndata['feat']
-            batch_labels = cluster.ndata['labels']
+            cluster.edata.pop(dgl.EID)
+            cluster = cluster.int().to(device)
+            input_nodes = cluster.ndata[dgl.NID]
+            batch_inputs = nfeat[input_nodes]
+            batch_labels = labels[input_nodes]
            tic_step = time.time()

            # Compute loss and prediction
-            batch_pred = model(cluster, feat)
+            batch_pred = model(cluster, batch_inputs)
            batch_pred = batch_pred[mask]
            batch_labels = batch_labels[mask]
            loss = nn.functional.nll_loss(batch_pred, batch_labels)
@@ -199,7 +190,7 @@ def run(args, device, data):
            avg += toc - tic
        if epoch % args.eval_every == 0 and epoch != 0:
-            eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, device)
+            eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, args.val_batch_size, device)
            model = model.to(device)
            if args.save_pred:
                np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
@@ -229,6 +220,11 @@ if __name__ == '__main__':
    argparser.add_argument('--wd', type=float, default=0)
    argparser.add_argument('--num_partitions', type=int, default=15000)
    argparser.add_argument('--num-workers', type=int, default=0)
+    argparser.add_argument('--data-cpu', action='store_true',
+                           help="By default the script puts all node features and labels "
+                                "on GPU when using it to save time for data copy. This may "
+                                "be undesired if they cannot fit in GPU memory at once. "
+                                "This flag disables that.")
    args = argparser.parse_args()

    if args.gpu >= 0:
@@ -242,22 +238,15 @@ if __name__ == '__main__':
    train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    graph, labels = data[0]
    labels = labels[:, 0]
-    print('Total edges before adding self-loop {}'.format(graph.number_of_edges()))
+    print('Total edges before adding self-loop {}'.format(graph.num_edges()))
    graph = dgl.remove_self_loop(graph)
    graph = dgl.add_self_loop(graph)
-    print('Total edges after adding self-loop {}'.format(graph.number_of_edges()))
+    print('Total edges after adding self-loop {}'.format(graph.num_edges()))

    num_nodes = train_idx.shape[0] + val_idx.shape[0] + test_idx.shape[0]
-    assert num_nodes == graph.number_of_nodes()
-    graph.ndata['labels'] = labels
+    assert num_nodes == graph.num_nodes()

    mask = th.zeros(num_nodes, dtype=th.bool)
    mask[train_idx] = True
    graph.ndata['train_mask'] = mask
-    mask = th.zeros(num_nodes, dtype=th.bool)
-    mask[val_idx] = True
-    graph.ndata['valid_mask'] = mask
-    mask = th.zeros(num_nodes, dtype=th.bool)
-    mask[test_idx] = True
-    graph.ndata['test_mask'] = mask

    graph.in_degrees(0)
    graph.out_degrees(0)
@@ -265,7 +254,9 @@ if __name__ == '__main__':
    cluster_iter_data = ClusterIter(
        'ogbn-products', graph, args.num_partitions, args.batch_size)
-    cluster_iterator = DataLoader(cluster_iter_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=4, collate_fn=partial(subgraph_collate_fn, graph))
+    cluster_iterator = DataLoader(cluster_iter_data, batch_size=args.batch_size, shuffle=True,
+                                  pin_memory=True, num_workers=4,
+                                  collate_fn=partial(subgraph_collate_fn, graph))

    in_feats = graph.ndata['feat'].shape[1]
    n_classes = (labels.max() + 1).item()
......
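The cluster loop above is the one place where features cannot be read off the minibatch graph directly: each cluster is a node-induced subgraph, so it carries its parent node IDs under dgl.NID, and those are used to gather rows from the now-external feature/label tensors. A toy demonstration of that mapping (graph and features invented for illustration):

import dgl
import torch as th

g = dgl.graph((th.tensor([0, 1, 2, 3]), th.tensor([1, 2, 3, 0])))
nfeat = th.randn(g.num_nodes(), 8)       # features kept outside the graph

cluster = dgl.node_subgraph(g, [1, 3])   # a "cluster" of two nodes
input_nodes = cluster.ndata[dgl.NID]     # parent IDs of the cluster's nodes
batch_inputs = nfeat[input_nodes]        # gather this cluster's feature rows
print(input_nodes, batch_inputs.shape)   # tensor([1, 3]) torch.Size([2, 8])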
 import os
-import random
-import dgl.function as fn
 import torch
-import time
 from partition_utils import *
......
@@ -4,17 +4,10 @@ import torch as th
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
-from torch.utils.data import DataLoader
-import dgl.function as fn
 import dgl.nn.pytorch as dglnn
 import time
 import argparse
-from _thread import start_new_thread
-from functools import wraps
-from dgl.data import RedditDataset
 import tqdm
-import traceback
 from ogb.nodeproppred import DglNodePropPredDataset
@@ -25,17 +18,18 @@ class GAT(nn.Module):
                  n_classes,
                  n_layers,
                  num_heads,
-                 activation,
-                 dropout):
+                 activation):
         super().__init__()
         self.n_layers = n_layers
         self.n_hidden = n_hidden
         self.n_classes = n_classes
         self.layers = nn.ModuleList()
-        self.layers.append(dglnn.GATConv((in_feats, in_feats), n_hidden, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=activation, negative_slope=0.2))
+        self.layers.append(dglnn.GATConv((in_feats, in_feats), n_hidden, num_heads=num_heads, activation=activation))
         for i in range(1, n_layers - 1):
-            self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_hidden, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=activation, negative_slope=0.2))
-        self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_classes, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=None, negative_slope=0.2))
+            self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_hidden,
+                                             num_heads=num_heads, activation=activation))
+        self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_classes,
+                                         num_heads=num_heads, activation=None))

     def forward(self, blocks, x):
         h = x
@@ -44,7 +38,7 @@ class GAT(nn.Module):
             # appropriate nodes on the LHS.
             # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
             # would be (num_nodes_RHS, D)
-            h_dst = h[:block.number_of_dst_nodes()]
+            h_dst = h[:block.num_dst_nodes()]
             # Then we compute the updated representation on the RHS.
             # The shape of h now becomes (num_nodes_RHS, D)
             if l < self.n_layers - 1:
@@ -54,7 +48,7 @@ class GAT(nn.Module):
             h = h.mean(1)
         return h.log_softmax(dim=-1)

-    def inference(self, g, x, batch_size, num_heads, device):
+    def inference(self, g, x, num_heads, device):
         """
         Inference with the GAT model on full neighbors (i.e. without neighbor sampling).
         g : the entire graph.
@@ -67,17 +61,16 @@ class GAT(nn.Module):
         # Therefore, we compute the representation of all nodes layer by layer. The nodes
         # on each layer are of course splitted in batches.
         # TODO: can we standardize this?
-        nodes = th.arange(g.number_of_nodes())
         for l, layer in enumerate(self.layers):
             if l < self.n_layers - 1:
-                y = th.zeros(g.number_of_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
+                y = th.zeros(g.num_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
             else:
-                y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
+                y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)

             sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
             dataloader = dgl.dataloading.NodeDataLoader(
                 g,
-                th.arange(g.number_of_nodes()),
+                th.arange(g.num_nodes()),
                 sampler,
                 batch_size=args.batch_size,
                 shuffle=True,
@@ -88,7 +81,7 @@ class GAT(nn.Module):
             for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                 block = blocks[0].int().to(device)
                 h = x[input_nodes].to(device)
-                h_dst = h[:block.number_of_dst_nodes()]
+                h_dst = h[:block.num_dst_nodes()]
                 if l < self.n_layers - 1:
                     h = layer(block, (h, h_dst)).flatten(1)
                 else:
@@ -99,7 +92,7 @@ class GAT(nn.Module):
                 y[output_nodes] = h.cpu()
             x = y
-        return y
+        return y.to(device)

 def compute_acc(pred, labels):
    """
@@ -107,7 +100,7 @@ def compute_acc(pred, labels):
    """
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

-def evaluate(model, g, labels, val_nid, test_nid, batch_size, num_heads, device):
+def evaluate(model, g, nfeat, labels, val_nid, test_nid, num_heads, device):
    """
    Evaluate the model on the validation set specified by ``val_mask``.
    g : The entire graph.
@@ -119,23 +112,22 @@ def evaluate(model, g, labels, val_nid, test_nid, batch_size, num_heads, device):
    """
    model.eval()
    with th.no_grad():
-        inputs = g.ndata['feat']
-        pred = model.inference(g, inputs, batch_size, num_heads, device)
+        pred = model.inference(g, nfeat, num_heads, device)
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred

-def load_subtensor(g, labels, seeds, input_nodes, device):
+def load_subtensor(nfeat, labels, seeds, input_nodes):
    """
-    Copys features and labels of a set of nodes onto GPU.
+    Extracts features and labels for a set of nodes.
    """
-    batch_inputs = g.ndata['feat'][input_nodes].to(device)
-    batch_labels = labels[seeds].to(device)
+    batch_inputs = nfeat[input_nodes]
+    batch_labels = labels[seeds]
    return batch_inputs, batch_labels

 #### Entry point
 def run(args, device, data):
    # Unpack data
-    train_nid, val_nid, test_nid, in_feats, labels, n_classes, g, num_heads = data
+    train_nid, val_nid, test_nid, in_feats, labels, n_classes, nfeat, g, num_heads = data

    # Create PyTorch DataLoader for constructing blocks
    sampler = dgl.dataloading.MultiLayerNeighborSampler(
@@ -150,7 +142,7 @@ def run(args, device, data):
        num_workers=args.num_workers)

    # Define model and optimizer
-    model = GAT(in_feats, args.num_hidden, n_classes, args.num_layers, num_heads, F.relu, args.dropout)
+    model = GAT(in_feats, args.num_hidden, n_classes, args.num_layers, num_heads, F.relu)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
@@ -171,7 +163,7 @@ def run(args, device, data):
            blocks = [blk.to(device) for blk in blocks]

            # Load the input features as well as output labels
-            batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, device)
+            batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)

            # Compute loss and prediction
            batch_pred = model(blocks, batch_inputs)
@@ -192,7 +184,7 @@ def run(args, device, data):
        if epoch >= 5:
            avg += toc - tic
        if epoch % args.eval_every == 0 and epoch != 0:
-            eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, num_heads, device)
+            eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, num_heads, device)
            if args.save_pred:
                np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
            print('Eval Acc {:.4f}'.format(eval_acc))
@@ -217,7 +209,6 @@ if __name__ == '__main__':
    argparser.add_argument('--log-every', type=int, default=20)
    argparser.add_argument('--eval-every', type=int, default=1)
    argparser.add_argument('--lr', type=float, default=0.001)
-    argparser.add_argument('--dropout', type=float, default=0.5)
    argparser.add_argument('--num-workers', type=int, default=8,
                           help="Number of sampling processes. Use 0 for no extra process.")
    argparser.add_argument('--save-pred', type=str, default='')
@@ -235,20 +226,21 @@ if __name__ == '__main__':
    splitted_idx = data.get_idx_split()
    train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    graph, labels = data[0]
-    labels = labels[:, 0]
+    nfeat = graph.ndata.pop('feat').to(device)
+    labels = labels[:, 0].to(device)

-    print('Total edges before adding self-loop {}'.format(graph.number_of_edges()))
+    print('Total edges before adding self-loop {}'.format(graph.num_edges()))
    graph = graph.remove_self_loop().add_self_loop()
-    print('Total edges after adding self-loop {}'.format(graph.number_of_edges()))
+    print('Total edges after adding self-loop {}'.format(graph.num_edges()))

-    in_feats = graph.ndata['feat'].shape[1]
+    in_feats = nfeat.shape[1]
    n_classes = (labels.max() + 1).item()

    # Create csr/coo/csc formats before launching sampling processes
    # This avoids creating certain formats in each data loader process, which saves momory and CPU.
    graph.create_formats_()
    # Pack data
-    data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, graph, args.head
+    data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, nfeat, graph, args.head

    # Run 10 times
    test_accs = []
......
...@@ -4,17 +4,10 @@ import torch as th ...@@ -4,17 +4,10 @@ import torch as th
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import torch.optim as optim import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn import dgl.nn.pytorch as dglnn
import time import time
import argparse import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset from ogb.nodeproppred import DglNodePropPredDataset
class SAGE(nn.Module): class SAGE(nn.Module):
...@@ -44,7 +37,7 @@ class SAGE(nn.Module): ...@@ -44,7 +37,7 @@ class SAGE(nn.Module):
# appropriate nodes on the LHS. # appropriate nodes on the LHS.
# Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
# would be (num_nodes_RHS, D) # would be (num_nodes_RHS, D)
h_dst = h[:block.number_of_dst_nodes()] h_dst = h[:block.num_dst_nodes()]
# Then we compute the updated representation on the RHS. # Then we compute the updated representation on the RHS.
# The shape of h now becomes (num_nodes_RHS, D) # The shape of h now becomes (num_nodes_RHS, D)
h = layer(block, (h, h_dst)) h = layer(block, (h, h_dst))
...@@ -53,7 +46,7 @@ class SAGE(nn.Module): ...@@ -53,7 +46,7 @@ class SAGE(nn.Module):
h = self.dropout(h) h = self.dropout(h)
return h return h
def inference(self, g, x, batch_size, device): def inference(self, g, x, device):
""" """
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph. g : the entire graph.
...@@ -66,14 +59,13 @@ class SAGE(nn.Module): ...@@ -66,14 +59,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes # Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches. # on each layer are of course splitted in batches.
# TODO: can we standardize this? # TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers): for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes).to(device)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader( dataloader = dgl.dataloading.NodeDataLoader(
g, g,
th.arange(g.number_of_nodes()), th.arange(g.num_nodes()),
sampler, sampler,
batch_size=args.batch_size, batch_size=args.batch_size,
shuffle=True, shuffle=True,
...@@ -83,14 +75,14 @@ class SAGE(nn.Module): ...@@ -83,14 +75,14 @@ class SAGE(nn.Module):
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
block = blocks[0].int().to(device) block = blocks[0].int().to(device)
h = x[input_nodes].to(device) h = x[input_nodes]
h_dst = h[:block.number_of_dst_nodes()] h_dst = h[:block.num_dst_nodes()]
h = layer(block, (h, h_dst)) h = layer(block, (h, h_dst))
if l != len(self.layers) - 1: if l != len(self.layers) - 1:
h = self.activation(h) h = self.activation(h)
h = self.dropout(h) h = self.dropout(h)
y[output_nodes] = h.cpu() y[output_nodes] = h
x = y x = y
return y return y
...@@ -101,35 +93,33 @@ def compute_acc(pred, labels): ...@@ -101,35 +93,33 @@ def compute_acc(pred, labels):
""" """
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred) return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, device): def evaluate(model, g, nfeat, labels, val_nid, test_nid, device):
""" """
Evaluate the model on the validation and test sets specified by ``val_nid`` and ``test_nid``. Evaluate the model on the validation and test sets specified by ``val_nid`` and ``test_nid``.
g : The entire graph. g : The entire graph.
inputs : The features of all the nodes. inputs : The features of all the nodes.
labels : The labels of all the nodes. labels : The labels of all the nodes.
val_nid : the node IDs for which validation accuracy is computed. val_nid : the node IDs for which validation accuracy is computed.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on. device : The GPU device to evaluate on.
""" """
model.eval() model.eval()
with th.no_grad(): with th.no_grad():
inputs = g.ndata['feat'] pred = model.inference(g, nfeat, device)
pred = model.inference(g, inputs, batch_size, device)
model.train() model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
def load_subtensor(g, labels, seeds, input_nodes, device): def load_subtensor(nfeat, labels, seeds, input_nodes):
""" """
Copies features and labels of a set of nodes onto GPU. Extracts features and labels for a set of nodes.
""" """
batch_inputs = g.ndata['feat'][input_nodes].to(device) batch_inputs = nfeat[input_nodes]
batch_labels = labels[seeds].to(device) batch_labels = labels[seeds]
return batch_inputs, batch_labels return batch_inputs, batch_labels
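The rewritten `load_subtensor` is pure indexing: the one-time copy of `nfeat` and `labels` to the GPU happens at load time, so each minibatch slice is already a device tensor. A self-contained sketch of the pattern, with stand-in shapes:

```python
import torch as th

device = th.device('cuda:0')

# Stand-in tensors; in the example these come from the dataset.
nfeat = th.randn(10000, 128).to(device)          # one-time H2D copy
labels = th.randint(0, 47, (10000,)).to(device)  # one-time H2D copy

def load_subtensor(nfeat, labels, seeds, input_nodes):
    # Both slices stay on the GPU; no per-batch .to(device) is needed.
    return nfeat[input_nodes], labels[seeds]

batch_inputs, batch_labels = load_subtensor(
    nfeat, labels, th.arange(64), th.arange(256))
```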
#### Entry point #### Entry point
def run(args, device, data): def run(args, device, data):
# Unpack data # Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g = data train_nid, val_nid, test_nid, in_feats, labels, n_classes, nfeat, g = data
# Create PyTorch DataLoader for constructing blocks # Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler( sampler = dgl.dataloading.MultiLayerNeighborSampler(
...@@ -147,7 +137,6 @@ def run(args, device, data): ...@@ -147,7 +137,6 @@ def run(args, device, data):
model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout) model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
model = model.to(device) model = model.to(device)
loss_fcn = nn.CrossEntropyLoss() loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
# Training loop # Training loop
...@@ -167,7 +156,7 @@ def run(args, device, data): ...@@ -167,7 +156,7 @@ def run(args, device, data):
blocks = [blk.int().to(device) for blk in blocks] blocks = [blk.int().to(device) for blk in blocks]
# Load the input features as well as output labels # Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, device) batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)
# Compute loss and prediction # Compute loss and prediction
batch_pred = model(blocks, batch_inputs) batch_pred = model(blocks, batch_inputs)
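Taken together, a training step now copies only the sampled blocks per batch; features and labels are sliced from GPU-resident tensors. A condensed sketch of the loop under those assumptions (`model`, `dataloader`, `loss_fcn`, `optimizer`, and `load_subtensor` follow the surrounding code):

```python
def train_one_epoch(model, dataloader, nfeat, labels, loss_fcn, optimizer, device):
    """One epoch of minibatch training; `nfeat`/`labels` already on `device`."""
    for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
        # The message-flow graphs are the only per-batch host-to-device copy.
        blocks = [blk.int().to(device) for blk in blocks]
        # Pure indexing on GPU-resident tensors, no .to(device) here.
        batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)
        batch_pred = model(blocks, batch_inputs)
        loss = loss_fcn(batch_pred, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```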
...@@ -188,7 +177,7 @@ def run(args, device, data): ...@@ -188,7 +177,7 @@ def run(args, device, data):
if epoch >= 5: if epoch >= 5:
avg += toc - tic avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0: if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, device) eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, device)
if args.save_pred: if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d') np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
print('Eval Acc {:.4f}'.format(eval_acc)) print('Eval Acc {:.4f}'.format(eval_acc))
...@@ -230,15 +219,16 @@ if __name__ == '__main__': ...@@ -230,15 +219,16 @@ if __name__ == '__main__':
splitted_idx = data.get_idx_split() splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test'] train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0] graph, labels = data[0]
labels = labels[:, 0] nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)
in_feats = graph.ndata['feat'].shape[1] in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item() n_classes = (labels.max() + 1).item()
# Create csr/coo/csc formats before launching sampling processes # Create csr/coo/csc formats before launching sampling processes
# This avoids creating certain formats in each data loader process, which saves memory and CPU. # This avoids creating certain formats in each data loader process, which saves memory and CPU.
graph.create_formats_() graph.create_formats_()
# Pack data # Pack data
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, graph data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, nfeat, graph
# Run 10 times # Run 10 times
test_accs = [] test_accs = []
......
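Before moving on to the next file: the entry-point change above follows the same idea as `run`. It pops the feature tensor off the graph, moves it and the labels to the GPU once, and passes both through the packed `data` tuple so the training code never touches `g.ndata['feat']`. A hedged sketch, assuming an OGB node-property dataset (`'ogbn-products'` is illustrative):

```python
import torch as th
from ogb.nodeproppred import DglNodePropPredDataset

device = th.device('cuda:0')
data = DglNodePropPredDataset(name='ogbn-products')  # dataset name is illustrative
splitted_idx = data.get_idx_split()
graph, labels = data[0]

# One-time device placement; everything downstream slices in place.
nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)

in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item()

# Materialize sparse formats once, before sampler workers fork.
graph.create_formats_()
data = (splitted_idx['train'], splitted_idx['valid'], splitted_idx['test'],
        in_feats, labels, n_classes, nfeat, graph)
```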
...@@ -7,10 +7,7 @@ import itertools ...@@ -7,10 +7,7 @@ import itertools
import numpy as np import numpy as np
import time import time
import torch as th import torch as th
import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from torch.utils.data import DataLoader
from functools import partial
import dgl import dgl
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
...@@ -32,7 +29,7 @@ def evaluate(model, loader, node_embed, labels, category, device): ...@@ -32,7 +29,7 @@ def evaluate(model, loader, node_embed, labels, category, device):
blocks = [blk.to(device) for blk in blocks] blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category] seeds = seeds[category]
emb = extract_embed(node_embed, input_nodes) emb = extract_embed(node_embed, input_nodes)
emb = {k : e.to(device) for k, e in emb.items()} emb = {k: e.to(device) for k, e in emb.items()}
lbl = labels[seeds].to(device) lbl = labels[seeds].to(device)
logits = model(emb, blocks)[category] logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl) loss = F.cross_entropy(logits, lbl)
...@@ -43,6 +40,13 @@ def evaluate(model, loader, node_embed, labels, category, device): ...@@ -43,6 +40,13 @@ def evaluate(model, loader, node_embed, labels, category, device):
return total_loss / count, total_acc / count return total_loss / count, total_acc / count
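In the heterogeneous (RGCN) example, evaluation gathers per-node-type embedding rows for each sampled batch and copies only those rows to the device. `extract_embed` is not shown in this hunk; the version below is a simplified stand-in for the helper the example uses:

```python
import torch as th

def extract_embed(node_embed, input_nodes):
    """Gather embedding rows per node type for the sampled input nodes."""
    return {ntype: node_embed[ntype][nids]
            for ntype, nids in input_nodes.items()}

device = 'cuda:0' if th.cuda.is_available() else 'cpu'

# Stand-in embedding tables and sampled nodes for two node types.
node_embed = {'author': th.randn(100, 16), 'paper': th.randn(200, 16)}
input_nodes = {'author': th.tensor([0, 3, 5]), 'paper': th.tensor([7, 9])}

emb = extract_embed(node_embed, input_nodes)
# Copy only the gathered rows, never the full tables.
emb = {k: e.to(device) for k, e in emb.items()}
```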
def main(args): def main(args):
# check cuda
device = 'cpu'
use_cuda = args.gpu >= 0 and th.cuda.is_available()
if use_cuda:
th.cuda.set_device(args.gpu)
device = 'cuda:%d' % args.gpu
# load graph data # load graph data
if args.dataset == 'aifb': if args.dataset == 'aifb':
dataset = AIFBDataset() dataset = AIFBDataset()
...@@ -71,19 +75,13 @@ def main(args): ...@@ -71,19 +75,13 @@ def main(args):
else: else:
val_idx = train_idx val_idx = train_idx
# check cuda
device = 'cpu'
use_cuda = args.gpu >= 0 and th.cuda.is_available()
if use_cuda:
th.cuda.set_device(args.gpu)
device = 'cuda:%d' % args.gpu
train_label = labels[train_idx]
val_label = labels[val_idx]
test_label = labels[test_idx]
# create embeddings # create embeddings
embed_layer = RelGraphEmbed(g, args.n_hidden) embed_layer = RelGraphEmbed(g, args.n_hidden)
if not args.data_cpu:
labels = labels.to(device)
embed_layer = embed_layer.to(device)
node_embed = embed_layer() node_embed = embed_layer()
# create model # create model
model = EntityClassify(g, model = EntityClassify(g,
...@@ -187,6 +185,11 @@ if __name__ == '__main__': ...@@ -187,6 +185,11 @@ if __name__ == '__main__':
help="Mini-batch size. If -1, use full graph training.") help="Mini-batch size. If -1, use full graph training.")
parser.add_argument("--fanout", type=int, default=4, parser.add_argument("--fanout", type=int, default=4,
help="Fan-out of neighbor sampling.") help="Fan-out of neighbor sampling.")
parser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
fp = parser.add_mutually_exclusive_group(required=False) fp = parser.add_mutually_exclusive_group(required=False)
fp.add_argument('--validation', dest='validation', action='store_true') fp.add_argument('--validation', dest='validation', action='store_true')
fp.add_argument('--testing', dest='validation', action='store_false') fp.add_argument('--testing', dest='validation', action='store_false')
......
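Finally, the new `--data-cpu` flag makes GPU residency opt-out rather than opt-in: by default the labels (and the embedding layer) are moved to the device once, and the flag keeps them on CPU when they would not fit. A minimal sketch of the gating pattern, with stand-in data:

```python
import argparse
import torch as th

parser = argparse.ArgumentParser()
parser.add_argument('--data-cpu', action='store_true',
                    help="Keep data on CPU instead of the default GPU placement.")
args = parser.parse_args()

device = th.device('cuda:0' if th.cuda.is_available() else 'cpu')
labels = th.randint(0, 4, (1000,))  # stand-in labels

if not args.data_cpu:
    # Default path: pay the host-to-device copy once, up front.
    labels = labels.to(device)
```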