"...text-generation-inference.git" did not exist on "7de8a377b067af5d9133874b88f5b0a37452a5eb"
Unverified commit 927d2b31, authored by Mufei Li, committed by GitHub

[Example] Move Data to GPU before Minibatch Training (#2453)

parent 72ef642f
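
The diff below applies the same pattern to every example script: node features and labels are popped out of `g.ndata` into plain tensors, optionally moved to the GPU once up front (controllable with the new `--data-cpu` flag), and then sliced per minibatch inside `load_subtensor`. A minimal sketch of that pattern, using a toy random graph and a `data_cpu` variable standing in for the scripts' argparse option (not part of the diff itself):

```python
import dgl
import torch as th

def load_subtensor(nfeat, labels, seeds, input_nodes, device):
    """Extract features/labels for a subset of nodes; .to() is a no-op if already on device."""
    batch_inputs = nfeat[input_nodes].to(device)
    batch_labels = labels[seeds].to(device)
    return batch_inputs, batch_labels

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
data_cpu = False                      # stand-in for the new --data-cpu flag

g = dgl.rand_graph(1000, 5000)        # toy graph (assumption; the scripts load Reddit/OGB)
g.ndata['features'] = th.randn(g.num_nodes(), 16)
g.ndata['labels'] = th.randint(0, 4, (g.num_nodes(),))

# Detach features/labels from the graph; the graph itself stays on CPU for sampling.
nfeat = g.ndata.pop('features')
labels = g.ndata.pop('labels')
if not data_cpu:
    nfeat = nfeat.to(device)          # one bulk copy instead of one copy per minibatch
    labels = labels.to(device)

sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = dgl.dataloading.NodeDataLoader(
    g, th.arange(g.num_nodes()), sampler,
    batch_size=256, shuffle=True, drop_last=False, num_workers=0)

for input_nodes, seeds, blocks in dataloader:
    blocks = [b.int().to(device) for b in blocks]
    batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes, device)
    # ... forward/backward on batch_inputs / batch_labels ...
```
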
......@@ -4,19 +4,12 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from load_graph import load_reddit, load_ogb, inductive_split
from load_graph import load_reddit, inductive_split
class SAGE(nn.Module):
def __init__(self,
......@@ -47,7 +40,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -62,12 +55,12 @@ class SAGE(nn.Module):
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -96,34 +89,35 @@ def compute_acc(pred, labels):
labels = labels.long()
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, device):
"""
Evaluate the model on the validation set specified by ``val_nid``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_nid : the node Ids for validation.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid])
return compute_acc(pred[val_nid], labels[val_nid].to(pred.device))
def load_subtensor(g, seeds, input_nodes, device):
def load_subtensor(nfeat, labels, seeds, input_nodes, device):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a subset of nodes
"""
batch_inputs = g.ndata['features'][input_nodes].to(device)
batch_labels = g.ndata['labels'][seeds].to(device)
batch_inputs = nfeat[input_nodes].to(device)
batch_labels = labels[seeds].to(device)
return batch_inputs, batch_labels
#### Entry point
def run(args, device, data):
# Unpack data
in_feats, n_classes, train_g, val_g, test_g = data
n_classes, train_g, val_g, test_g, train_nfeat, train_labels, \
val_nfeat, val_labels, test_nfeat, test_labels = data
in_feats = train_nfeat.shape[1]
train_nid = th.nonzero(train_g.ndata['train_mask'], as_tuple=True)[0]
val_nid = th.nonzero(val_g.ndata['val_mask'], as_tuple=True)[0]
test_nid = th.nonzero(~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
......@@ -144,7 +138,6 @@ def run(args, device, data):
model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Training loop
......@@ -158,10 +151,9 @@ def run(args, device, data):
tic_step = time.time()
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)
batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -183,9 +175,9 @@ def run(args, device, data):
if epoch >= 5:
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc = evaluate(model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, device)
eval_acc = evaluate(model, val_g, val_nfeat, val_labels, val_nid, device)
print('Eval Acc {:.4f}'.format(eval_acc))
test_acc = evaluate(model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, device)
test_acc = evaluate(model, test_g, test_nfeat, test_labels, test_nid, device)
print('Test Acc: {:.4f}'.format(test_acc))
print('Avg epoch time: {}'.format(avg / (epoch - 4)))
......@@ -193,7 +185,7 @@ def run(args, device, data):
if __name__ == '__main__':
argparser = argparse.ArgumentParser("multi-gpu training")
argparser.add_argument('--gpu', type=int, default=0,
help="GPU device ID. Use -1 for CPU training")
help="GPU device ID. Use -1 for CPU training")
argparser.add_argument('--dataset', type=str, default='reddit')
argparser.add_argument('--num-epochs', type=int, default=20)
argparser.add_argument('--num-hidden', type=int, default=16)
......@@ -205,9 +197,14 @@ if __name__ == '__main__':
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=4,
help="Number of sampling processes. Use 0 for no extra process.")
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--inductive', action='store_true',
help="Inductive learning setting")
help="Inductive learning setting")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
if args.gpu >= 0:
......@@ -217,17 +214,25 @@ if __name__ == '__main__':
if args.dataset == 'reddit':
g, n_classes = load_reddit()
elif args.dataset == 'ogb-product':
g, n_classes = load_ogb('ogbn-products')
else:
raise Exception('unknown dataset')
in_feats = g.ndata['features'].shape[1]
if args.inductive:
train_g, val_g, test_g = inductive_split(g)
train_nfeat = train_g.ndata.pop('features')
val_nfeat = val_g.ndata.pop('features')
test_nfeat = test_g.ndata.pop('features')
train_labels = train_g.ndata.pop('labels')
val_labels = val_g.ndata.pop('labels')
test_labels = test_g.ndata.pop('labels')
else:
train_g = val_g = test_g = g
train_nfeat = val_nfeat = test_nfeat = g.ndata.pop('features')
train_labels = val_labels = test_labels = g.ndata.pop('labels')
if not args.data_cpu:
train_nfeat = train_nfeat.to(device)
train_labels = train_labels.to(device)
# Create csr/coo/csc formats before launching training processes with multi-gpu.
# This avoids creating certain formats in each sub-process, which saves momory and CPU.
......@@ -235,6 +240,7 @@ if __name__ == '__main__':
val_g.create_formats_()
test_g.create_formats_()
# Pack data
data = in_feats, n_classes, train_g, val_g, test_g
data = n_classes, train_g, val_g, test_g, train_nfeat, train_labels, \
val_nfeat, val_labels, test_nfeat, test_labels
run(args, device, data)
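
One subtlety in this first script: `model.inference` now returns predictions on whatever device the node features live on, while the validation/test labels may still sit on the CPU (for instance when `--data-cpu` is set), hence the added `labels[val_nid].to(pred.device)` in `evaluate`. A small self-contained illustration of that guard, with toy tensors standing in for the real predictions and labels:

```python
import torch as th

def compute_acc(pred, labels):
    labels = labels.long()
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

pred = th.randn(4, 3)                     # stand-in for model.inference(...) output
labels = th.tensor([0, 2, 1, 0])          # labels possibly left on the host
val_nid = th.tensor([1, 3])

# Move only the compared slice of labels onto the prediction's device.
acc = compute_acc(pred[val_nid], labels[val_nid].to(pred.device))
```
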
......@@ -5,16 +5,12 @@ import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import math
import argparse
from dgl.data import RedditDataset
from torch.nn.parallel import DistributedDataParallel
import tqdm
import traceback
from utils import thread_wrapped_func
from load_graph import load_reddit, inductive_split
......@@ -48,7 +44,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -62,14 +58,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -97,27 +92,26 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, device):
"""
Evaluate the model on the validation set specified by ``val_nid``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_nid : A node ID tensor indicating which nodes do we actually compute the accuracy for.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid])
def load_subtensor(g, labels, seeds, input_nodes, dev_id):
def load_subtensor(nfeat, labels, seeds, input_nodes, dev_id):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a subset of nodes.
"""
batch_inputs = g.ndata['features'][input_nodes].to(dev_id)
batch_inputs = nfeat[input_nodes].to(dev_id)
batch_labels = labels[seeds].to(dev_id)
return batch_inputs, batch_labels
......@@ -137,7 +131,25 @@ def run(proc_id, n_gpus, args, devices, data):
th.cuda.set_device(dev_id)
# Unpack data
in_feats, n_classes, train_g, val_g, test_g = data
n_classes, train_g, val_g, test_g = data
if args.inductive:
train_nfeat = train_g.ndata.pop('features')
val_nfeat = val_g.ndata.pop('features')
test_nfeat = test_g.ndata.pop('features')
train_labels = train_g.ndata.pop('labels')
val_labels = val_g.ndata.pop('labels')
test_labels = test_g.ndata.pop('labels')
else:
train_nfeat = val_nfeat = test_nfeat = g.ndata.pop('features')
train_labels = val_labels = test_labels = g.ndata.pop('labels')
if not args.data_cpu:
train_nfeat = train_nfeat.to(dev_id)
train_labels = train_labels.to(dev_id)
in_feats = train_nfeat.shape[1]
train_mask = train_g.ndata['train_mask']
val_mask = val_g.ndata['val_mask']
test_mask = ~(test_g.ndata['train_mask'] | test_g.ndata['val_mask'])
......@@ -166,7 +178,6 @@ def run(proc_id, n_gpus, args, devices, data):
if n_gpus > 1:
model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(dev_id)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Training loop
......@@ -182,7 +193,8 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
# Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(train_g, train_g.ndata['labels'], seeds, input_nodes, dev_id)
batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
seeds, input_nodes, dev_id)
blocks = [block.int().to(dev_id) for block in blocks]
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -209,14 +221,14 @@ def run(proc_id, n_gpus, args, devices, data):
if epoch % args.eval_every == 0 and epoch != 0:
if n_gpus == 1:
eval_acc = evaluate(
model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
model, val_g, val_nfeat, val_labels, val_nid, devices[0])
test_acc = evaluate(
model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
model, test_g, test_nfeat, test_labels, test_nid, devices[0])
else:
eval_acc = evaluate(
model.module, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
model.module, val_g, val_nfeat, val_labels, val_nid, devices[0])
test_acc = evaluate(
model.module, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
model.module, test_g, test_nfeat, test_labels, test_nid, devices[0])
print('Eval Acc {:.4f}'.format(eval_acc))
print('Test Acc: {:.4f}'.format(test_acc))
......@@ -229,7 +241,7 @@ def run(proc_id, n_gpus, args, devices, data):
if __name__ == '__main__':
argparser = argparse.ArgumentParser("multi-gpu training")
argparser.add_argument('--gpu', type=str, default='0',
help="Comma separated list of GPU device IDs.")
help="Comma separated list of GPU device IDs.")
argparser.add_argument('--num-epochs', type=int, default=20)
argparser.add_argument('--num-hidden', type=int, default=16)
argparser.add_argument('--num-layers', type=int, default=2)
......@@ -240,9 +252,14 @@ if __name__ == '__main__':
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--inductive', action='store_true',
help="Inductive learning setting")
help="Inductive learning setting")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
......@@ -251,7 +268,6 @@ if __name__ == '__main__':
g, n_classes = load_reddit()
# Construct graph
g = dgl.as_heterograph(g)
in_feats = g.ndata['features'].shape[1]
if args.inductive:
train_g, val_g, test_g = inductive_split(g)
......@@ -264,7 +280,7 @@ if __name__ == '__main__':
val_g.create_formats_()
test_g.create_formats_()
# Pack data
data = in_feats, n_classes, train_g, val_g, test_g
data = n_classes, train_g, val_g, test_g
if n_gpus == 1:
run(0, n_gpus, args, devices, data)
......
......@@ -5,17 +5,13 @@ import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
from torch.nn.parallel import DistributedDataParallel
import tqdm
import traceback
import sklearn.linear_model as lm
import sklearn.metrics as skm
......@@ -38,13 +34,6 @@ class NegativeSampler(object):
src = src.repeat_interleave(self.k)
return src, dst
def load_subtensor(g, input_nodes, device):
"""
Copys features and labels of a set of nodes onto GPU.
"""
batch_inputs = g.ndata['features'][input_nodes].to(device)
return batch_inputs
class SAGE(nn.Module):
def __init__(self,
in_feats,
......@@ -74,7 +63,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -88,14 +77,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -155,24 +143,23 @@ def compute_acc(emb, labels, train_nids, val_nids, test_nids):
f1_micro_test = skm.f1_score(test_labels, pred[test_nids], average='micro')
return f1_micro_eval, f1_micro_test
def evaluate(model, g, inputs, labels, train_nids, val_nids, test_nids, batch_size, device):
def evaluate(model, g, nfeat, labels, train_nids, val_nids, test_nids, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_mask : A 0-1 mask indicating which nodes do we actually compute the accuracy for.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
# single gpu
if isinstance(model, SAGE):
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
# multi gpu
else:
pred = model.module.inference(g, inputs, batch_size, device)
pred = model.module.inference(g, nfeat, device)
model.train()
return compute_acc(pred, labels, train_nids, val_nids, test_nids)
......@@ -188,18 +175,17 @@ def run(proc_id, n_gpus, args, devices, data):
init_method=dist_init_method,
world_size=world_size,
rank=proc_id)
train_mask, val_mask, test_mask, in_feats, labels, n_classes, g = data
train_mask, val_mask, test_mask, n_classes, g = data
nfeat = g.ndata.pop('feat')
labels = g.ndata.pop('label')
in_feats = nfeat.shape[1]
train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
val_nid = th.LongTensor(np.nonzero(val_mask)).squeeze()
test_nid = th.LongTensor(np.nonzero(test_mask)).squeeze()
#train_nid = th.LongTensor(np.nonzero(train_mask)[0])
#val_nid = th.LongTensor(np.nonzero(val_mask)[0])
#test_nid = th.LongTensor(np.nonzero(test_mask)[0])
# Create PyTorch DataLoader for constructing blocks
n_edges = g.number_of_edges()
n_edges = g.num_edges()
train_seeds = np.arange(n_edges)
if n_gpus > 0:
num_per_gpu = (train_seeds.shape[0] + n_gpus -1) // n_gpus
......@@ -230,7 +216,6 @@ def run(proc_id, n_gpus, args, devices, data):
if n_gpus > 1:
model = DistributedDataParallel(model, device_ids=[device], output_device=device)
loss_fcn = CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Training loop
......@@ -249,7 +234,7 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader):
batch_inputs = load_subtensor(g, input_nodes, device)
batch_inputs = nfeat[input_nodes].to(device)
d_step = time.time()
pos_graph = pos_graph.to(device)
......@@ -263,8 +248,8 @@ def run(proc_id, n_gpus, args, devices, data):
optimizer.step()
t = time.time()
pos_edges = pos_graph.number_of_edges()
neg_edges = neg_graph.number_of_edges()
pos_edges = pos_graph.num_edges()
neg_edges = neg_graph.num_edges()
iter_pos.append(pos_edges / (t - tic_step))
iter_neg.append(neg_edges / (t - tic_step))
iter_d.append(d_step - tic_step)
......@@ -276,34 +261,37 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
if step % args.eval_every == 0 and proc_id == 0:
eval_acc, test_acc = evaluate(model, g, g.ndata['features'], labels, train_nid, val_nid, test_nid, args.batch_size, device)
eval_acc, test_acc = evaluate(model, g, nfeat, labels, train_nid, val_nid, test_nid, device)
print('Eval Acc {:.4f} Test Acc {:.4f}'.format(eval_acc, test_acc))
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
best_test_acc = test_acc
print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format(best_eval_acc, best_test_acc))
toc = time.time()
if proc_id == 0:
print('Epoch Time(s): {:.4f}'.format(toc - tic))
if epoch >= 5:
avg += toc - tic
if n_gpus > 1:
th.distributed.barrier()
print('Avg epoch time: {}'.format(avg / (epoch - 4)))
if proc_id == 0:
print('Avg epoch time: {}'.format(avg / (epoch - 4)))
def main(args, devices):
# load reddit data
data = RedditDataset(self_loop=False)
n_classes = data.num_classes
g = data[0]
features = g.ndata['feat']
in_feats = features.shape[1]
labels = g.ndata['label']
train_mask = g.ndata['train_mask']
val_mask = g.ndata['val_mask']
test_mask = g.ndata['test_mask']
g.ndata['features'] = features
# Create csr/coo/csc formats before launching training processes with multi-gpu.
# This avoids creating certain formats in each sub-process, which saves momory and CPU.
g.create_formats_()
# Pack data
data = train_mask, val_mask, test_mask, in_feats, labels, n_classes, g
data = train_mask, val_mask, test_mask, n_classes, g
n_gpus = len(devices)
if devices[0] == -1:
......@@ -324,13 +312,14 @@ def main(args, devices):
if __name__ == '__main__':
argparser = argparse.ArgumentParser("multi-gpu training")
argparser.add_argument("--gpu", type=str, default='0',
help="GPU, can be a list of gpus for multi-gpu trianing, e.g., 0,1,2,3; -1 for CPU")
help="GPU, can be a list of gpus for multi-gpu trianing,"
" e.g., 0,1,2,3; -1 for CPU")
argparser.add_argument('--num-epochs', type=int, default=20)
argparser.add_argument('--num-hidden', type=int, default=16)
argparser.add_argument('--num-layers', type=int, default=2)
argparser.add_argument('--num-negs', type=int, default=1)
argparser.add_argument('--neg-share', default=False, action='store_true',
help="sharing neg nodes for positive nodes")
help="sharing neg nodes for positive nodes")
argparser.add_argument('--fan-out', type=str, default='10,25')
argparser.add_argument('--batch-size', type=int, default=10000)
argparser.add_argument('--log-every', type=int, default=20)
......@@ -338,7 +327,7 @@ if __name__ == '__main__':
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
help="Number of sampling processes. Use 0 for no extra process.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
......
......@@ -5,24 +5,15 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset
from sampler import ClusterIter, subgraph_collate_fn
#### Neighbor sampler
class GAT(nn.Module):
def __init__(self,
in_feats,
......@@ -79,16 +70,15 @@ class GAT(nn.Module):
layers.
"""
num_heads = self.num_heads
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
if l < self.n_layers - 1:
y = th.zeros(g.number_of_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
else:
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=batch_size,
shuffle=False,
......@@ -98,7 +88,6 @@ class GAT(nn.Module):
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
block = blocks[0].int().to(device)
h = x[input_nodes].to(device)
h_dst = h[:block.number_of_dst_nodes()].to(device)
if l < self.n_layers - 1:
h = layer(block, h).flatten(1)
else:
......@@ -116,7 +105,7 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, test_nid, batch_size, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
......@@ -128,8 +117,7 @@ def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
"""
model.eval()
with th.no_grad():
inputs = g.ndata['feat']
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, batch_size, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
......@@ -142,7 +130,8 @@ def model_param_summary(model):
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g, cluster_iterator = data
labels = labels.to(device)
nfeat = g.ndata.pop('feat').to(device)
# Define model and optimizer
model = GAT(in_feats, args.num_heads, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
......@@ -164,16 +153,18 @@ def run(args, device, data):
# blocks.
tic_start = time.time()
for step, cluster in enumerate(cluster_iterator):
cluster = cluster.int().to(device)
mask = cluster.ndata['train_mask']
mask = cluster.ndata.pop('train_mask')
if mask.sum() == 0:
continue
feat = cluster.ndata['feat']
batch_labels = cluster.ndata['labels']
cluster.edata.pop(dgl.EID)
cluster = cluster.int().to(device)
input_nodes = cluster.ndata[dgl.NID]
batch_inputs = nfeat[input_nodes]
batch_labels = labels[input_nodes]
tic_step = time.time()
# Compute loss and prediction
batch_pred = model(cluster, feat)
batch_pred = model(cluster, batch_inputs)
batch_pred = batch_pred[mask]
batch_labels = batch_labels[mask]
loss = nn.functional.nll_loss(batch_pred, batch_labels)
......@@ -199,7 +190,7 @@ def run(args, device, data):
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, device)
eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, args.val_batch_size, device)
model = model.to(device)
if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
......@@ -229,6 +220,11 @@ if __name__ == '__main__':
argparser.add_argument('--wd', type=float, default=0)
argparser.add_argument('--num_partitions', type=int, default=15000)
argparser.add_argument('--num-workers', type=int, default=0)
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
if args.gpu >= 0:
......@@ -242,22 +238,15 @@ if __name__ == '__main__':
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
labels = labels[:, 0]
print('Total edges before adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges before adding self-loop {}'.format(graph.num_edges()))
graph = dgl.remove_self_loop(graph)
graph = dgl.add_self_loop(graph)
print('Total edges after adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges after adding self-loop {}'.format(graph.num_edges()))
num_nodes = train_idx.shape[0] + val_idx.shape[0] + test_idx.shape[0]
assert num_nodes == graph.number_of_nodes()
graph.ndata['labels'] = labels
assert num_nodes == graph.num_nodes()
mask = th.zeros(num_nodes, dtype=th.bool)
mask[train_idx] = True
graph.ndata['train_mask'] = mask
mask = th.zeros(num_nodes, dtype=th.bool)
mask[val_idx] = True
graph.ndata['valid_mask'] = mask
mask = th.zeros(num_nodes, dtype=th.bool)
mask[test_idx] = True
graph.ndata['test_mask'] = mask
graph.in_degrees(0)
graph.out_degrees(0)
......@@ -265,7 +254,9 @@ if __name__ == '__main__':
cluster_iter_data = ClusterIter(
'ogbn-products', graph, args.num_partitions, args.batch_size)
cluster_iterator = DataLoader(cluster_iter_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=4, collate_fn=partial(subgraph_collate_fn, graph))
cluster_iterator = DataLoader(cluster_iter_data, batch_size=args.batch_size, shuffle=True,
pin_memory=True, num_workers=4,
collate_fn=partial(subgraph_collate_fn, graph))
in_feats = graph.ndata['feat'].shape[1]
n_classes = (labels.max() + 1).item()
......
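
In the cluster-GAT script above, features and labels are no longer read from `cluster.ndata`; instead each partition's parent node IDs, which the subgraph machinery stores under `dgl.NID`, index into tensors that may already live on the GPU. A hedged sketch of that lookup on a toy graph (using `dgl.node_subgraph` as a stand-in for one cluster produced by the script's partitioner):

```python
import dgl
import torch as th

g = dgl.rand_graph(100, 400)                          # toy parent graph (assumption)
nfeat = th.randn(g.num_nodes(), 16)                   # features kept outside g.ndata
labels = th.randint(0, 3, (g.num_nodes(),))

cluster = dgl.node_subgraph(g, th.arange(0, 10))      # stand-in for one partition
parent_ids = cluster.ndata[dgl.NID]                   # original node IDs of this cluster
batch_inputs = nfeat[parent_ids]                      # gather from the (possibly GPU) tensor
batch_labels = labels[parent_ids]
```
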
import os
import random
import dgl.function as fn
import torch
import time
from partition_utils import *
......
......@@ -4,17 +4,10 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset
......@@ -25,17 +18,18 @@ class GAT(nn.Module):
n_classes,
n_layers,
num_heads,
activation,
dropout):
activation):
super().__init__()
self.n_layers = n_layers
self.n_hidden = n_hidden
self.n_classes = n_classes
self.layers = nn.ModuleList()
self.layers.append(dglnn.GATConv((in_feats, in_feats), n_hidden, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=activation, negative_slope=0.2))
self.layers.append(dglnn.GATConv((in_feats, in_feats), n_hidden, num_heads=num_heads, activation=activation))
for i in range(1, n_layers - 1):
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_hidden, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=activation, negative_slope=0.2))
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_classes, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=None, negative_slope=0.2))
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_hidden,
num_heads=num_heads, activation=activation))
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_classes,
num_heads=num_heads, activation=None))
def forward(self, blocks, x):
h = x
......@@ -44,7 +38,7 @@ class GAT(nn.Module):
# appropriate nodes on the LHS.
# Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
# would be (num_nodes_RHS, D)
h_dst = h[:block.number_of_dst_nodes()]
h_dst = h[:block.num_dst_nodes()]
# Then we compute the updated representation on the RHS.
# The shape of h now becomes (num_nodes_RHS, D)
if l < self.n_layers - 1:
......@@ -54,7 +48,7 @@ class GAT(nn.Module):
h = h.mean(1)
return h.log_softmax(dim=-1)
def inference(self, g, x, batch_size, num_heads, device):
def inference(self, g, x, num_heads, device):
"""
Inference with the GAT model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -67,17 +61,16 @@ class GAT(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
if l < self.n_layers - 1:
y = th.zeros(g.number_of_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
else:
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -88,7 +81,7 @@ class GAT(nn.Module):
block = blocks[0].int().to(device)
h = x[input_nodes].to(device)
h_dst = h[:block.number_of_dst_nodes()]
h_dst = h[:block.num_dst_nodes()]
if l < self.n_layers - 1:
h = layer(block, (h, h_dst)).flatten(1)
else:
......@@ -99,7 +92,7 @@ class GAT(nn.Module):
y[output_nodes] = h.cpu()
x = y
return y
return y.to(device)
def compute_acc(pred, labels):
"""
......@@ -107,7 +100,7 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, num_heads, device):
def evaluate(model, g, nfeat, labels, val_nid, test_nid, num_heads, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
......@@ -119,23 +112,22 @@ def evaluate(model, g, labels, val_nid, test_nid, batch_size, num_heads, device)
"""
model.eval()
with th.no_grad():
inputs = g.ndata['feat']
pred = model.inference(g, inputs, batch_size, num_heads, device)
pred = model.inference(g, nfeat, num_heads, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
def load_subtensor(g, labels, seeds, input_nodes, device):
def load_subtensor(nfeat, labels, seeds, input_nodes):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a set of nodes.
"""
batch_inputs = g.ndata['feat'][input_nodes].to(device)
batch_labels = labels[seeds].to(device)
batch_inputs = nfeat[input_nodes]
batch_labels = labels[seeds]
return batch_inputs, batch_labels
#### Entry point
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g, num_heads = data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, nfeat, g, num_heads = data
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler(
......@@ -150,7 +142,7 @@ def run(args, device, data):
num_workers=args.num_workers)
# Define model and optimizer
model = GAT(in_feats, args.num_hidden, n_classes, args.num_layers, num_heads, F.relu, args.dropout)
model = GAT(in_feats, args.num_hidden, n_classes, args.num_layers, num_heads, F.relu)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
......@@ -171,7 +163,7 @@ def run(args, device, data):
blocks = [blk.to(device) for blk in blocks]
# Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, device)
batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -192,7 +184,7 @@ def run(args, device, data):
if epoch >= 5:
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, num_heads, device)
eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, num_heads, device)
if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
print('Eval Acc {:.4f}'.format(eval_acc))
......@@ -217,7 +209,6 @@ if __name__ == '__main__':
argparser.add_argument('--log-every', type=int, default=20)
argparser.add_argument('--eval-every', type=int, default=1)
argparser.add_argument('--lr', type=float, default=0.001)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=8,
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--save-pred', type=str, default='')
......@@ -235,20 +226,21 @@ if __name__ == '__main__':
splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
labels = labels[:, 0]
nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)
print('Total edges before adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges before adding self-loop {}'.format(graph.num_edges()))
graph = graph.remove_self_loop().add_self_loop()
print('Total edges after adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges after adding self-loop {}'.format(graph.num_edges()))
in_feats = graph.ndata['feat'].shape[1]
in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item()
# Create csr/coo/csc formats before launching sampling processes
# This avoids creating certain formats in each data loader process, which saves momory and CPU.
graph.create_formats_()
# Pack data
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, graph, args.head
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, nfeat, graph, args.head
# Run 10 times
test_accs = []
......
......@@ -4,17 +4,10 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset
class SAGE(nn.Module):
......@@ -44,7 +37,7 @@ class SAGE(nn.Module):
# appropriate nodes on the LHS.
# Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
# would be (num_nodes_RHS, D)
h_dst = h[:block.number_of_dst_nodes()]
h_dst = h[:block.num_dst_nodes()]
# Then we compute the updated representation on the RHS.
# The shape of h now becomes (num_nodes_RHS, D)
h = layer(block, (h, h_dst))
......@@ -53,7 +46,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -66,14 +59,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes).to(device)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -83,14 +75,14 @@ class SAGE(nn.Module):
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
block = blocks[0].int().to(device)
h = x[input_nodes].to(device)
h_dst = h[:block.number_of_dst_nodes()]
h = x[input_nodes]
h_dst = h[:block.num_dst_nodes()]
h = layer(block, (h, h_dst))
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
y[output_nodes] = h.cpu()
y[output_nodes] = h
x = y
return y
......@@ -101,35 +93,33 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, test_nid, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_mask : A 0-1 mask indicating which nodes do we actually compute the accuracy for.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
inputs = g.ndata['feat']
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
def load_subtensor(g, labels, seeds, input_nodes, device):
def load_subtensor(nfeat, labels, seeds, input_nodes):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a set of nodes.
"""
batch_inputs = g.ndata['feat'][input_nodes].to(device)
batch_labels = labels[seeds].to(device)
batch_inputs = nfeat[input_nodes]
batch_labels = labels[seeds]
return batch_inputs, batch_labels
#### Entry point
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g = data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, nfeat, g = data
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler(
......@@ -147,7 +137,6 @@ def run(args, device, data):
model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
# Training loop
......@@ -167,7 +156,7 @@ def run(args, device, data):
blocks = [blk.int().to(device) for blk in blocks]
# Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, device)
batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -188,7 +177,7 @@ def run(args, device, data):
if epoch >= 5:
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, device)
eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, device)
if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
print('Eval Acc {:.4f}'.format(eval_acc))
......@@ -230,15 +219,16 @@ if __name__ == '__main__':
splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
labels = labels[:, 0]
nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)
in_feats = graph.ndata['feat'].shape[1]
in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item()
# Create csr/coo/csc formats before launching sampling processes
# This avoids creating certain formats in each data loader process, which saves momory and CPU.
graph.create_formats_()
# Pack data
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, graph
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, nfeat, graph
# Run 10 times
test_accs = []
......
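
In the ogbn-products GraphSAGE script, layer-wise inference now keeps the per-layer output buffer `y` on the GPU, so activations are written with a device-local copy instead of an `h.cpu()` round trip; this assumes the full feature matrix fits in GPU memory. A condensed sketch of one such layer pass, using a toy graph and a single SAGEConv layer rather than the script's full model:

```python
import dgl
import dgl.nn.pytorch as dglnn
import torch as th

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
g = dgl.rand_graph(200, 2000)                          # toy graph (assumption)
x = th.randn(g.num_nodes(), 8, device=device)          # node features already on the GPU
layer = dglnn.SAGEConv(8, 16, 'mean').to(device)

y = th.zeros(g.num_nodes(), 16, device=device)         # output buffer on the GPU (was CPU)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
    g, th.arange(g.num_nodes()), sampler,
    batch_size=64, shuffle=False, drop_last=False, num_workers=0)

with th.no_grad():
    for input_nodes, output_nodes, blocks in dataloader:
        block = blocks[0].int().to(device)
        h = x[input_nodes]                              # GPU gather, no host copy
        h_dst = h[:block.num_dst_nodes()]
        y[output_nodes] = layer(block, (h, h_dst))      # stays on the GPU, no .cpu()
```
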
......@@ -7,10 +7,7 @@ import itertools
import numpy as np
import time
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from functools import partial
import dgl
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
......@@ -32,7 +29,7 @@ def evaluate(model, loader, node_embed, labels, category, device):
blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category]
emb = extract_embed(node_embed, input_nodes)
emb = {k : e.to(device) for k, e in emb.items()}
emb = {k: e.to(device) for k, e in emb.items()}
lbl = labels[seeds].to(device)
logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl)
......@@ -43,6 +40,13 @@ def evaluate(model, loader, node_embed, labels, category, device):
return total_loss / count, total_acc / count
def main(args):
# check cuda
device = 'cpu'
use_cuda = args.gpu >= 0 and th.cuda.is_available()
if use_cuda:
th.cuda.set_device(args.gpu)
device = 'cuda:%d' % args.gpu
# load graph data
if args.dataset == 'aifb':
dataset = AIFBDataset()
......@@ -71,19 +75,13 @@ def main(args):
else:
val_idx = train_idx
# check cuda
device = 'cpu'
use_cuda = args.gpu >= 0 and th.cuda.is_available()
if use_cuda:
th.cuda.set_device(args.gpu)
device = 'cuda:%d' % args.gpu
train_label = labels[train_idx]
val_label = labels[val_idx]
test_label = labels[test_idx]
# create embeddings
embed_layer = RelGraphEmbed(g, args.n_hidden)
if not args.data_cpu:
labels = labels.to(device)
embed_layer = embed_layer.to(device)
node_embed = embed_layer()
# create model
model = EntityClassify(g,
......@@ -187,6 +185,11 @@ if __name__ == '__main__':
help="Mini-batch size. If -1, use full graph training.")
parser.add_argument("--fanout", type=int, default=4,
help="Fan-out of neighbor sampling.")
parser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
fp = parser.add_mutually_exclusive_group(required=False)
fp.add_argument('--validation', dest='validation', action='store_true')
fp.add_argument('--testing', dest='validation', action='store_false')
......