Unverified commit 4ddd477f, authored by Quan (Andy) Gan, committed by GitHub

[Model] GraphSAGE inductive example (#1741)

* graphsage inductive example

* fix
parent 168a88e5
README.md
@@ -33,7 +33,9 @@ python3 train_full.py --dataset cora --gpu 0 # full graph
 Train w/ mini-batch sampling (on the Reddit dataset)
 ```bash
 python3 train_sampling.py --num-epochs 30 # neighbor sampling
+python3 train_sampling.py --num-epochs 30 --inductive # inductive learning with neighbor sampling
 python3 train_sampling_multi_gpu.py --num-epochs 30 # neighbor sampling with multi GPU
+python3 train_sampling_multi_gpu.py --num-epochs 30 --inductive # inductive learning with neighbor sampling, multi GPU
 python3 train_cv.py --num-epochs 30 # control variate sampling
 python3 train_cv_multi_gpu.py --num-epochs 30 # control variate sampling with multi GPU
 ```
@@ -44,6 +46,7 @@ Accuracy:
 |:---------------------:|:--------:|
 | Full Graph             | 0.9504   |
 | Neighbor Sampling      | 0.9495   |
+| N.S. (Inductive)       | 0.9460   |
 | Control Variate        | 0.9490   |

 ### Unsupervised training
load_graph.py
@@ -15,9 +15,9 @@ def load_reddit():
     g = data.graph
     g.ndata['features'] = features
     g.ndata['labels'] = labels
-    g.ndata['train_mask'] = th.LongTensor(data.train_mask)
-    g.ndata['val_mask'] = th.LongTensor(data.val_mask)
-    g.ndata['test_mask'] = th.LongTensor(data.test_mask)
+    g.ndata['train_mask'] = th.BoolTensor(data.train_mask)
+    g.ndata['val_mask'] = th.BoolTensor(data.val_mask)
+    g.ndata['test_mask'] = th.BoolTensor(data.test_mask)
     return g, data.num_labels

 def load_ogb(name):
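The LongTensor to BoolTensor switch is not cosmetic: a boolean tensor indexes by mask, while an integer 0/1 tensor indexes by position, so downstream code such as `pred[val_mask]` would silently gather nodes 0 and 1 over and over. A standalone illustration (not part of the commit):

```python
import torch as th

mask = th.tensor([1, 0, 1, 0])   # 0/1 integer "mask"
x = th.arange(4) * 10            # tensor([ 0, 10, 20, 30])

# Integer indexing treats the entries as positions:
print(x[mask])                   # tensor([10,  0, 10,  0])

# Boolean indexing treats them as a mask, which is what the
# subgraph and accuracy code in this commit relies on:
print(x[mask.bool()])            # tensor([ 0, 20])
```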
@@ -35,13 +35,21 @@ def load_ogb(name):
     # Find the node IDs in the training, validation, and test set.
     train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
-    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
-    train_mask[train_nid] = 1
-    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
-    val_mask[val_nid] = 1
-    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
-    test_mask[test_nid] = 1
+    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
+    train_mask[train_nid] = True
+    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
+    val_mask[val_nid] = True
+    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
+    test_mask[test_nid] = True
     graph.ndata['train_mask'] = train_mask
     graph.ndata['val_mask'] = val_mask
     graph.ndata['test_mask'] = test_mask
     return graph, len(th.unique(graph.ndata['labels']))
+
+def inductive_split(g):
+    """Split the graph into a training graph, a validation graph, and a test graph
+    using the training and validation masks. Suitable for inductive models."""
+    train_g = g.subgraph(g.ndata['train_mask'])
+    val_g = g.subgraph(g.ndata['train_mask'] | g.ndata['val_mask'])
+    test_g = g
+    return train_g, val_g, test_g
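The asymmetry in `inductive_split` is deliberate: `val_g` keeps the training nodes because validation nodes still aggregate messages from their already-seen training neighbors, while `test_g` is the full graph. A toy usage sketch (the graph and masks below are invented for illustration; exact `subgraph` behavior with boolean masks depends on the DGL version, and in these scripts the graph is first converted with `dgl.as_heterograph`):

```python
import dgl
import torch as th
from load_graph import inductive_split

# A toy 4-node graph: nodes 0-1 are train, node 2 is validation, node 3 is test.
g = dgl.graph(([0, 1, 2, 3], [1, 2, 3, 0]))
g.ndata['train_mask'] = th.tensor([True, True, False, False])
g.ndata['val_mask'] = th.tensor([False, False, True, False])

train_g, val_g, test_g = inductive_split(g)
print(train_g.number_of_nodes())  # 2: training nodes only
print(val_g.number_of_nodes())    # 3: train + validation nodes
print(test_g.number_of_nodes())   # 4: the full graph
```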
train_sampling.py
@@ -16,7 +16,7 @@ from dgl.data import RedditDataset
 import tqdm
 import traceback

-from load_graph import load_reddit, load_ogb
+from load_graph import load_reddit, load_ogb, inductive_split

 class SAGE(nn.Module):
     def __init__(self,
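The inductive evaluation path below relies on `model.inference`, which this diff truncates. Roughly, the layer-wise inference in this example looks like the sketch below (reconstructed from the surrounding code of this era's DGL example; buffer handling and progress reporting may differ in the actual file):

```python
def inference(self, g, x, batch_size, device):
    """Layer-wise full-neighbor inference: compute representations for all
    nodes one GNN layer at a time, so minibatch-trained weights can be
    evaluated exactly on a (possibly unseen) graph."""
    nodes = th.arange(g.number_of_nodes())
    for l, layer in enumerate(self.layers):
        y = th.zeros(g.number_of_nodes(),
                     self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
        for start in range(0, len(nodes), batch_size):
            batch_nodes = nodes[start:start + batch_size]
            # Block containing every in-edge of the batch nodes (no sampling).
            block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
            input_nodes = block.srcdata[dgl.NID]
            h = x[input_nodes].to(device)
            h_dst = h[:block.number_of_dst_nodes()]
            h = layer(block, (h, h_dst))
            if l != len(self.layers) - 1:
                h = self.activation(h)
                h = self.dropout(h)
            y[start:start + batch_size] = h.cpu()
        x = y
    return y
```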
@@ -142,15 +142,16 @@ def load_subtensor(g, seeds, input_nodes, device):
 #### Entry point
 def run(args, device, data):
     # Unpack data
-    train_mask, val_mask, in_feats, n_classes, g = data
-    train_nid = th.nonzero(train_mask, as_tuple=True)[0]
-    val_nid = th.nonzero(val_mask, as_tuple=True)[0]
+    in_feats, n_classes, train_g, val_g, test_g = data
+    train_nid = th.nonzero(train_g.ndata['train_mask'], as_tuple=True)[0]
+    val_nid = th.nonzero(val_g.ndata['val_mask'], as_tuple=True)[0]
+    test_nid = th.nonzero(~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]

     # Create PyTorch DataLoader for constructing blocks
     sampler = dgl.sampling.MultiLayerNeighborSampler(
         [int(fanout) for fanout in args.fan_out.split(',')])
     dataloader = dgl.sampling.NodeDataLoader(
-        g,
+        train_g,
         train_nid,
         sampler,
         batch_size=args.batch_size,
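For context, each iteration over this `NodeDataLoader` yields the pieces consumed by the training loop below; schematically (a fragment using the names defined in this script, not a self-contained program):

```python
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
    # input_nodes: all nodes whose features are needed at the input layer.
    # seeds:       the output (destination) nodes of this minibatch.
    # blocks:      one bipartite message-flow block per GNN layer.
    batch_inputs = train_g.ndata['features'][input_nodes].to(device)
    batch_labels = train_g.ndata['labels'][seeds].to(device)
    batch_pred = model(blocks, batch_inputs)
```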
@@ -177,7 +178,7 @@ def run(args, device, data):
             tic_step = time.time()

             # Load the input features as well as output labels
-            batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
+            batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)

             # Compute loss and prediction
             batch_pred = model(blocks, batch_inputs)
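The rest of the training step is elided by the diff; it is the usual supervised update, roughly (assuming `loss_fcn` and `optimizer` are the cross-entropy loss and Adam optimizer set up earlier in the script):

```python
            loss = loss_fcn(batch_pred, batch_labels)  # cross-entropy on seed nodes
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
```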
@@ -198,8 +199,10 @@ def run(args, device, data):
         if epoch >= 5:
             avg += toc - tic
         if epoch % args.eval_every == 0 and epoch != 0:
-            eval_acc = evaluate(model, g, g.ndata['features'], g.ndata['labels'], val_nid, args.batch_size, device)
+            eval_acc = evaluate(model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, device)
             print('Eval Acc {:.4f}'.format(eval_acc))
+            test_acc = evaluate(model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, device)
+            print('Test Acc: {:.4f}'.format(test_acc))

     print('Avg epoch time: {}'.format(avg / (epoch - 4)))
@@ -219,6 +222,8 @@ if __name__ == '__main__':
     argparser.add_argument('--dropout', type=float, default=0.5)
     argparser.add_argument('--num-workers', type=int, default=0,
                            help="Number of sampling processes. Use 0 for no extra process.")
+    argparser.add_argument('--inductive', action='store_true',
+                           help="Inductive learning setting")
     args = argparser.parse_args()

     if args.gpu >= 0:
@@ -232,12 +237,20 @@ if __name__ == '__main__':
         g, n_classes = load_ogb('ogbn-products')
     else:
         raise Exception('unknown dataset')
+    g = dgl.as_heterograph(g)
     in_feats = g.ndata['features'].shape[1]
-    train_mask = g.ndata['train_mask']
-    val_mask = g.ndata['val_mask']
-    prepare_mp(g)
-    g = dgl.as_heterograph(g)
+
+    if args.inductive:
+        train_g, val_g, test_g = inductive_split(g)
+    else:
+        train_g = val_g = test_g = g
+
+    prepare_mp(train_g)
+    prepare_mp(val_g)
+    prepare_mp(test_g)

     # Pack data
-    data = train_mask, val_mask, in_feats, n_classes, g
+    data = in_feats, n_classes, train_g, val_g, test_g

     run(args, device, data)
train_sampling_multi_gpu.py
@@ -16,6 +16,7 @@ import tqdm
 import traceback

 from utils import thread_wrapped_func
+from load_graph import load_reddit, inductive_split

 class SAGE(nn.Module):
     def __init__(self,
@@ -114,13 +115,13 @@ def compute_acc(pred, labels):
     """
     return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

-def evaluate(model, g, inputs, labels, val_mask, batch_size, device):
+def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
     """
-    Evaluate the model on the validation set specified by ``val_mask``.
+    Evaluate the model on the validation set specified by ``val_nid``.
     g : The entire graph.
     inputs : The features of all the nodes.
     labels : The labels of all the nodes.
-    val_mask : A 0-1 mask indicating which nodes do we actually compute the accuracy for.
+    val_nid : A tensor of node IDs indicating the nodes for which the accuracy is actually computed.
     batch_size : Number of nodes to compute at the same time.
     device : The GPU device to evaluate on.
     """
@@ -128,7 +129,7 @@ def evaluate(model, g, inputs, labels, val_mask, batch_size, device):
     with th.no_grad():
         pred = model.inference(g, inputs, batch_size, device)
     model.train()
-    return compute_acc(pred[val_mask], labels[val_mask])
+    return compute_acc(pred[val_nid], labels[val_nid])

 def load_subtensor(g, labels, seeds, input_nodes, dev_id):
     """
@@ -154,11 +155,13 @@ def run(proc_id, n_gpus, args, devices, data):
         th.cuda.set_device(dev_id)

     # Unpack data
-    train_mask, val_mask, in_feats, labels, n_classes, g = data
-    train_nid = th.LongTensor(np.nonzero(train_mask)[0])
-    val_nid = th.LongTensor(np.nonzero(val_mask)[0])
-    train_mask = th.BoolTensor(train_mask)
-    val_mask = th.BoolTensor(val_mask)
+    in_feats, n_classes, train_g, val_g, test_g = data
+    train_mask = train_g.ndata['train_mask']
+    val_mask = val_g.ndata['val_mask']
+    test_mask = ~(train_g.ndata['train_mask'] | val_g.ndata['val_mask'])
+    train_nid = train_mask.nonzero()[:, 0]
+    val_nid = val_mask.nonzero()[:, 0]
+    test_nid = test_mask.nonzero()[:, 0]

     # Split train_nid
     train_nid = th.split(train_nid, len(train_nid) // n_gpus)[proc_id]
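One subtlety in the split above: `th.split` with chunk size `len(train_nid) // n_gpus` yields `n_gpus + 1` chunks whenever the division leaves a remainder, so the trailing remainder nodes go unused; indexing with `proc_id` then gives each worker an equal share. A quick check:

```python
import torch as th

train_nid = th.arange(10)  # 10 training nodes
n_gpus = 3
chunks = th.split(train_nid, len(train_nid) // n_gpus)  # chunk size 3
print([len(c) for c in chunks])  # [3, 3, 3, 1]: four chunks, the last is the remainder
# Workers 0..2 take chunks[proc_id]; the one-node remainder chunk is unused.
```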
@@ -167,7 +170,7 @@ def run(proc_id, n_gpus, args, devices, data):
     sampler = dgl.sampling.MultiLayerNeighborSampler(
         [int(fanout) for fanout in args.fan_out.split(',')])
     dataloader = dgl.sampling.NodeDataLoader(
-        g,
+        train_g,
         train_nid,
         sampler,
         batch_size=args.batch_size,
@@ -197,7 +200,7 @@ def run(proc_id, n_gpus, args, devices, data):
             tic_step = time.time()

             # Load the input features as well as output labels
-            batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, dev_id)
+            batch_inputs, batch_labels = load_subtensor(train_g, train_g.ndata['labels'], seeds, input_nodes, dev_id)

             # Compute loss and prediction
             batch_pred = model(blocks, batch_inputs)
@@ -230,10 +233,17 @@ def run(proc_id, n_gpus, args, devices, data):
             avg += toc - tic
         if epoch % args.eval_every == 0 and epoch != 0:
             if n_gpus == 1:
-                eval_acc = evaluate(model, g, g.ndata['features'], labels, val_mask, args.batch_size, devices[0])
+                eval_acc = evaluate(
+                    model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
+                test_acc = evaluate(
+                    model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
             else:
-                eval_acc = evaluate(model.module, g, g.ndata['features'], labels, val_mask, args.batch_size, devices[0])
+                eval_acc = evaluate(
+                    model.module, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
+                test_acc = evaluate(
+                    model.module, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
             print('Eval Acc {:.4f}'.format(eval_acc))
+            print('Test Acc: {:.4f}'.format(test_acc))

     if n_gpus > 1:
@@ -256,25 +266,28 @@ if __name__ == '__main__':
     argparser.add_argument('--dropout', type=float, default=0.5)
     argparser.add_argument('--num-workers', type=int, default=0,
                            help="Number of sampling processes. Use 0 for no extra process.")
+    argparser.add_argument('--inductive', action='store_true',
+                           help="Inductive learning setting")
     args = argparser.parse_args()

     devices = list(map(int, args.gpu.split(',')))
     n_gpus = len(devices)

     # load reddit data
-    data = RedditDataset(self_loop=True)
-    train_mask = data.train_mask
-    val_mask = data.val_mask
-    features = th.Tensor(data.features)
-    in_feats = features.shape[1]
-    labels = th.LongTensor(data.labels)
-    n_classes = data.num_labels
+    g, n_classes = load_reddit()
-    # Construct graph
-    g = dgl.graph(data.graph.all_edges())
-    g.ndata['features'] = features
-    prepare_mp(g)
-    g = dgl.as_heterograph(g)
+    in_feats = g.ndata['features'].shape[1]
+
+    if args.inductive:
+        train_g, val_g, test_g = inductive_split(g)
+    else:
+        train_g = val_g = test_g = g
+
+    prepare_mp(train_g)
+    prepare_mp(val_g)
+    prepare_mp(test_g)

     # Pack data
-    data = train_mask, val_mask, in_feats, labels, n_classes, g
+    data = in_feats, n_classes, train_g, val_g, test_g

     if n_gpus == 1:
         run(0, n_gpus, args, devices, data)
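The multi-GPU launch branch is elided here; in this example each GPU gets its own worker process, roughly as below (assuming `torch.multiprocessing` is imported as `mp`, alongside the `thread_wrapped_func` helper imported above):

```python
    else:
        procs = []
        for proc_id in range(n_gpus):
            # One worker process per GPU; thread_wrapped_func works around
            # fork/OpenMP interactions when combining PyTorch and multiprocessing.
            p = mp.Process(target=thread_wrapped_func(run),
                           args=(proc_id, n_gpus, args, devices, data))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
```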