Unverified Commit d41d07d0 authored by Quan (Andy) Gan, committed by GitHub

[Doc and bugfix] Add docs and user guide and update tutorial for sampling pipeline (#3774)



* huuuuge update

* remove

* lint

* lint

* fix

* what happened to nccl

* update multi-gpu unsupervised graphsage example

* replace most of the dgl.mp.process with torch.mp.spawn

* update if condition for use_uva case

* update user guide

* address comments

* incorporating suggestions from @jermainewang

* oops

* fix tutorial to pass CI

* oops

* fix again
Co-authored-by: Xin Yao <xiny@nvidia.com>
parent 3bd5a9b6
from time import time

import numpy as np
from utils import arg_list

from dgl.transforms import metis_partition
from dgl import backend as F
import dgl


def get_partition_list(g, psize):
    # Partition the graph with METIS and return the node IDs of each partition.
    p_gs = metis_partition(g, psize)
    graphs = []
    for k, val in p_gs.items():
        nids = val.ndata[dgl.NID]
        nids = F.asnumpy(nids)
        graphs.append(nids)
    return graphs


def get_subgraph(g, par_arr, i, psize, batch_size):
    # Merge the i-th batch of partitions into a single node-induced subgraph.
    par_batch_ind_arr = [par_arr[s] for s in range(
        i * batch_size, (i + 1) * batch_size) if s < psize]
    g1 = g.subgraph(np.concatenate(
        par_batch_ind_arr).reshape(-1).astype(np.int64))
    return g1
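
# Usage sketch (hypothetical names, not part of this file): partition once,
# then pull out one batch of partitions as a training subgraph.
#
#     par_list = get_partition_list(g, psize=50)      # list of node-ID arrays
#     batch_g = get_subgraph(g, par_list, i=0, psize=50, batch_size=2)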
#!/bin/bash
python cluster_gcn.py --gpu 0 --dataset ppi --lr 1e-2 --weight-decay 0.0 --psize 50 --batch-size 1 --n-epochs 300 \
--n-hidden 2048 --n-layers 3 --log-every 100 --use-pp --self-loop \
--note self-loop-ppi-non-sym-ly3-pp-cluster-2-2-wd-0 --dropout 0.2 --use-val --normalize
#!/bin/bash
python cluster_gcn.py --gpu 0 --dataset reddit-self-loop --lr 1e-2 --weight-decay 0.0 --psize 1500 --batch-size 20 \
--n-epochs 30 --n-hidden 128 --n-layers 1 --log-every 100 --use-pp --self-loop \
--note self-loop-reddit-non-sym-ly3-pp-cluster-2-2-wd-5e-4 --dropout 0.2 --use-val --normalize
import os
import random

import dgl.function as fn
import numpy as np
import torch

from partition_utils import *


class ClusterIter(object):
    '''The partition sampler, given a DGLGraph and the number of partitions.
    METIS is used as the graph partitioning backend.
    '''
    def __init__(self, dn, g, psize, batch_size, seed_nid, use_pp=True):
        """Initialize the sampler.

        Parameters
        ----------
        dn : str
            The dataset name.
        g : DGLGraph
            The full graph of the dataset.
        psize : int
            The number of partitions.
        batch_size : int
            The number of partitions in one batch.
        seed_nid : np.ndarray
            The training node IDs, used to extract the training graph.
        use_pp : bool
            Whether to precompute the aggregated features AX.
        """
        self.use_pp = use_pp
        self.g = g.subgraph(seed_nid)
        # precalculate the aggregated features from the training graph only
        if use_pp:
            self.precalc(self.g)
            print('precalculating')

        self.psize = psize
        self.batch_size = batch_size
        # cache the partitions for known dataset / partition-number combinations
        if dn:
            fn = os.path.join('./datasets/', dn + '_{}.npy'.format(psize))
            if os.path.exists(fn):
                self.par_li = np.load(fn, allow_pickle=True)
            else:
                os.makedirs('./datasets/', exist_ok=True)
                self.par_li = get_partition_list(self.g, psize)
                np.save(fn, self.par_li)
        else:
            self.par_li = get_partition_list(self.g, psize)
        self.max = int((psize) // batch_size)
        random.shuffle(self.par_li)
        self.get_fn = get_subgraph
    def precalc(self, g):
        norm = self.get_norm(g)
        g.ndata['norm'] = norm
        features = g.ndata['feat']
        print("features shape, ", features.shape)
        with torch.no_grad():
            g.update_all(fn.copy_src(src='feat', out='m'),
                         fn.sum(msg='m', out='feat'),
                         None)
            pre_feats = g.ndata['feat'] * norm
            # use GraphSAGE-style aggregation: concatenate the raw features
            # with the normalized neighbor aggregation
            g.ndata['feat'] = torch.cat([features, pre_feats], dim=1)

    # use one-sided normalization
    def get_norm(self, g):
        norm = 1. / g.in_degrees().float().unsqueeze(1)
        norm[torch.isinf(norm)] = 0
        norm = norm.to(self.g.ndata['feat'].device)
        return norm
    def __len__(self):
        return self.max

    def __iter__(self):
        self.n = 0
        return self

    def __next__(self):
        if self.n < self.max:
            result = self.get_fn(self.g, self.par_li, self.n,
                                 self.psize, self.batch_size)
            self.n += 1
            return result
        else:
            random.shuffle(self.par_li)
            raise StopIteration
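
# Usage sketch (hypothetical names, not part of this file): each iteration
# yields one DGLGraph built from `batch_size` METIS partitions of the
# training graph.
#
#     cluster_iter = ClusterIter('ppi', g, psize=50, batch_size=1,
#                                seed_nid=train_nid, use_pp=True)
#     for cluster in cluster_iter:
#         logits = model(cluster)   # train on one cluster subgraph per step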
import os
from functools import namedtuple

import dgl
import numpy as np
import torch
from dgl.data import PPIDataset
from dgl.data import load_data as _load_data
from sklearn.metrics import f1_score


class Logger(object):
    '''A simple logger that writes messages both to stdout and to a log file.'''
    def __init__(self, path):
        """Initialize the logger.

        Parameters
        ----------
        path : str
            The file path to write the log to.
        """
        self.path = path

    def write(self, s):
        with open(self.path, 'a') as f:
            f.write(str(s))
        print(s)
        return


def arg_list(labels):
    # Group node indices by label; returns one index array per unique label.
    hist, indexes, inverse, counts = np.unique(
        labels, return_index=True, return_counts=True, return_inverse=True)
    li = []
    for h in hist:
        li.append(np.argwhere(inverse == h))
    return li


def save_log_dir(args):
    log_dir = './log/{}/{}'.format(args.dataset, args.note)
    os.makedirs(log_dir, exist_ok=True)
    return log_dir


def calc_f1(y_true, y_pred, multitask):
    if multitask:
        # threshold the logits at 0 for multi-label classification
        y_pred[y_pred > 0] = 1
        y_pred[y_pred <= 0] = 0
    else:
        y_pred = np.argmax(y_pred, axis=1)
    return f1_score(y_true, y_pred, average="micro"), \
        f1_score(y_true, y_pred, average="macro")


def evaluate(model, g, labels, mask, multitask=False):
    model.eval()
    with torch.no_grad():
        logits = model(g)
        logits = logits[mask]
        labels = labels[mask]
        f1_mic, f1_mac = calc_f1(labels.cpu().numpy(),
                                 logits.cpu().numpy(), multitask)
        return f1_mic, f1_mac


def load_data(args):
    '''Wraps DGL's load_data utility to handle the PPI special case.'''
    DataType = namedtuple('Dataset', ['num_classes', 'g'])
    if args.dataset != 'ppi':
        dataset = _load_data(args)
        data = DataType(g=dataset[0], num_classes=dataset.num_classes)
        return data

    # For PPI, batch the train/valid/test graphs into one big graph and
    # mark the splits with boolean node masks.
    train_dataset = PPIDataset('train')
    train_graph = dgl.batch([train_dataset[i] for i in range(len(train_dataset))],
                            edge_attrs=None, node_attrs=None)
    val_dataset = PPIDataset('valid')
    val_graph = dgl.batch([val_dataset[i] for i in range(len(val_dataset))],
                          edge_attrs=None, node_attrs=None)
    test_dataset = PPIDataset('test')
    test_graph = dgl.batch([test_dataset[i] for i in range(len(test_dataset))],
                           edge_attrs=None, node_attrs=None)
    G = dgl.batch(
        [train_graph, val_graph, test_graph], edge_attrs=None, node_attrs=None)

    train_nodes_num = train_graph.number_of_nodes()
    test_nodes_num = test_graph.number_of_nodes()
    val_nodes_num = val_graph.number_of_nodes()
    nodes_num = G.number_of_nodes()
    assert nodes_num == (train_nodes_num + test_nodes_num + val_nodes_num)
    # construct masks; the nodes are ordered as train, then validation, then test
    mask = np.zeros((nodes_num,), dtype=bool)
    train_mask = mask.copy()
    train_mask[:train_nodes_num] = True
    val_mask = mask.copy()
    val_mask[train_nodes_num:-test_nodes_num] = True
    test_mask = mask.copy()
    test_mask[-test_nodes_num:] = True
    G.ndata['train_mask'] = torch.tensor(train_mask, dtype=torch.bool)
    G.ndata['val_mask'] = torch.tensor(val_mask, dtype=torch.bool)
    G.ndata['test_mask'] = torch.tensor(test_mask, dtype=torch.bool)
    data = DataType(g=G, num_classes=train_dataset.num_labels)
    return data
@@ -20,8 +20,7 @@ from data import MovieLens
from model import GCMCLayer, DenseBiDecoder, BiDecoder
from utils import get_activation, get_optimizer, torch_total_param_num, torch_net_info, MetricLogger, to_etype_name
import dgl
import dgl.multiprocessing as mp
from dgl.multiprocessing import Queue
import torch.multiprocessing as mp
class Net(nn.Module):
def __init__(self, args, dev_id):
@@ -382,10 +381,4 @@ if __name__ == '__main__':
# This avoids creating certain formats in each sub-process, which saves memory and CPU.
dataset.train_enc_graph.create_formats_()
dataset.train_dec_graph.create_formats_()
procs = []
for proc_id in range(n_gpus):
p = mp.Process(target=run, args=(proc_id, n_gpus, args, devices, dataset))
p.start()
procs.append(p)
for p in procs:
p.join()
mp.spawn(run, args=(n_gpus, args, devices, dataset), nprocs=n_gpus)
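
# Illustrative sketch (not part of this diff): torch.multiprocessing.spawn
# launches `nprocs` workers, calls the target as fn(proc_id, *args), and joins
# them, which is why the manual Process/start/join loop above can be dropped
# while `run` keeps the same signature.
#
#     import torch.multiprocessing as mp
#
#     def run(proc_id, n_gpus, args, devices, dataset):
#         ...  # one training process per GPU; proc_id is filled in by spawn
#
#     mp.spawn(run, args=(n_gpus, args, devices, dataset), nprocs=n_gpus)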
@@ -28,70 +28,19 @@ python3 train_full.py --dataset cora --gpu 0 # full graph
* citeseer: ~0.7110
* pubmed: ~0.7830
### Minibatch training
### Minibatch training for node classification
Train w/ mini-batch sampling (on the Reddit dataset)
```bash
python3 train_sampling.py --num-epochs 30 # neighbor sampling
python3 train_sampling.py --num-epochs 30 --inductive # inductive learning with neighbor sampling
python3 train_cv.py --num-epochs 30 # control variate sampling
```
For multi-GPU training:
```bash
python3 train_sampling_multi_gpu.py --num-epochs 30 --gpu 0,1,... # neighbor sampling
python3 train_sampling_multi_gpu.py --num-epochs 30 --inductive --gpu 0,1,... # inductive learning
python3 train_cv_multi_gpu.py --num-epochs 30 --gpu 0,1,... # control variate sampling
```
Accuracy:
| Model | Accuracy |
|:---------------------:|:--------:|
| Full Graph | 0.9504 |
| Neighbor Sampling | 0.9495 |
| N.S. (Inductive) | 0.9460 |
| Control Variate | 0.9490 |
### Unsupervised training
Train w/ mini-batch sampling for node classification on OGB-products:
Train w/ mini-batch sampling in an unsupervised fashion (on the Reddit dataset)
```bash
python3 train_sampling_unsupervised.py
python3 node_classification.py
python3 multi_gpu_node_classification.py
```
Notably,
* The loss function is defined by predicting whether an edge exists between two nodes or not (a minimal sketch follows this list). This matches the official
implementation, and is equivalent to the loss defined in the paper with 1-hop random walks.
* When computing the score of `(u, v)`, the connections between node `u` and `v` are removed from neighbor sampling.
This trick increases the F1-micro score on the test set by 0.02.
* The performance of the learned embeddings is measured by training a softmax regression with scikit-learn, as described
in the paper.
Micro F1 score reaches 0.9212 on the test set.
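The sketch below illustrates the edge-existence objective described above; `pos_score` and `neg_score` stand for the model's scores on sampled positive and negative node pairs (hypothetical names, not the exact code in this example):
```python
import torch as th
import torch.nn.functional as F

def unsupervised_loss(pos_score, neg_score):
    # label real edges 1 and sampled non-edges 0, then apply binary
    # cross-entropy to the pair scores (logits)
    score = th.cat([pos_score, neg_score])
    label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(score, label.float())
```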
### Use GPU sampling and CUDA UVA sampling
For the training scripts `train_sampling.py`, `train_sampling_multi_gpu.py` and `train_sampling_unsupervised.py`, we provide the arguments `--graph-device` and `--data-device` (see the sketch after the lists below).
For `--graph-device`, the choices are:
- `cpu` (default): Use the CPU to sample the graph structure stored in host memory.
- `gpu`: Use the GPU to sample the graph structure stored in GPU device memory. You have to copy the graph structure (only the `csc` format is needed) to the GPU before passing it to the dataloader. This is the fastest way to sample, but it requires storing the whole graph structure in GPU memory and duplicates it on each GPU in multi-GPU training.
- `uva`: Use the GPU to sample the graph structure stored in **pinned** host memory through zero-copy access. You have to pin the graph structure before passing it to the dataloader. This is much faster than CPU sampling and especially useful when the graph structure is too large to fit into GPU memory.
For `--data-device`, the choices are:
- `cpu`: Node features are stored in host memory. Slicing and transferring node features to the GPU takes a lot of time during training.
- `gpu` (default): Node features are stored in GPU device memory. This is the fastest way to slice and transfer features, but it consumes a lot of GPU memory.
- `uva`: Use the GPU to slice and access the node features stored in **pinned** host memory (also called `UnifiedTensor`) through zero-copy access. This is especially useful when the node features are too large to fit into GPU memory.
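A minimal sketch of how these choices map onto the sampling pipeline (assuming DGL >= 0.8; the helper and variable names here are illustrative, not the exact code of the scripts):
```python
import dgl
import torch as th

def place_inputs(g, nfeat, device, graph_device, data_device):
    # 'gpu': move the csc-format graph to device memory for full GPU sampling;
    # 'uva' leaves the graph in host memory and relies on use_uva=True below.
    if graph_device == 'gpu':
        g = g.formats(['csc']).to(device)
    # 'gpu': features resident on the GPU; 'uva': pinned host memory
    # accessed from the GPU via zero-copy.
    if data_device == 'gpu':
        nfeat = nfeat.to(device)
    elif data_device == 'uva':
        nfeat = nfeat.pin_memory()
    return g, nfeat

# sampler = dgl.dataloading.NeighborSampler([10, 25])
# dataloader = dgl.dataloading.NodeDataLoader(
#     g, train_nid, sampler, device=device, batch_size=1024, shuffle=True,
#     use_uva=(graph_device == 'uva'))
```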
### Training with PyTorch Lightning
We also provide minibatch training scripts with PyTorch Lightning in `train_lightning.py` and `train_lightning_unsupervised.py`.
### Minibatch training for link prediction
Requires `pytorch_lightning` and `torchmetrics`.
Train w/ mini-batch sampling for link prediction on OGB-Citation2:
```bash
python3 train_lightning.py
python3 train_lightning_unsupervised.py
python3 link_pred.py
```
More Examples for Training GraphSAGE
============================
### Unsupervised training
Train w/ mini-batch sampling in an unsupervised fashion (on the Reddit dataset)
```bash
python3 train_sampling_unsupervised.py
```
Notably,
* The loss function is defined by predicting whether an edge exists between two nodes or not. This matches the official
implementation, and is equivalent to the loss defined in the paper with 1-hop random walks.
* When computing the score of `(u, v)`, the connections between node `u` and `v` are removed from neighbor sampling.
This trick increases the F1-micro score on the test set by 0.02.
* The performance of the learned embeddings is measured by training a softmax regression with scikit-learn, as described
in the paper.
Micro F1 score reaches 0.9212 on the test set.
This example also demonstrates advanced usage of multi-GPU `DDP` training, UVA-based sampling, full GPU sampling, and fine-grained control over where the graph structure and the features are stored.
### Training with PyTorch Lightning
We also provide minibatch training scripts with PyTorch Lightning in `train_lightning.py` and `train_lightning_unsupervised.py`.
Requires `pytorch_lightning` and `torchmetrics`.
```bash
python3 train_lightning.py
python3 train_lightning_unsupervised.py
```
@@ -5,13 +5,12 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import dgl.multiprocessing as mp
import torch.multiprocessing as mp
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from torch.nn.parallel import DistributedDataParallel
import tqdm
from model import SAGE, compute_acc_unsupervised as compute_acc
from negative_sampler import NegativeSampler
@@ -33,7 +32,7 @@ class CrossEntropyLoss(nn.Module):
loss = F.binary_cross_entropy_with_logits(score, label.float())
return loss
def evaluate(model, g, nfeat, labels, train_nids, val_nids, test_nids, device):
def evaluate(model, g, nfeat, labels, train_nids, val_nids, test_nids, device, args):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
@@ -71,10 +70,8 @@ def run(proc_id, n_gpus, args, devices, data):
if args.data_device == 'gpu':
nfeat = nfeat.to(device)
labels = labels.to(device)
elif args.data_device == 'uva':
nfeat = dgl.contrib.UnifiedTensor(nfeat, device=device)
labels = dgl.contrib.UnifiedTensor(labels, device=device)
in_feats = nfeat.shape[1]
# Create PyTorch DataLoader for constructing blocks
@@ -91,22 +88,25 @@ def run(proc_id, n_gpus, args, devices, data):
args.num_workers = 0
# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler(
sampler = dgl.dataloading.NeighborSampler(
[int(fanout) for fanout in args.fan_out.split(',')])
dataloader = dgl.dataloading.EdgeDataLoader(
g, train_seeds, sampler, exclude='reverse_id',
sampler = dgl.dataloading.as_edge_prediction_sampler(
sampler, exclude='reverse_id',
# For each edge with ID e in the Reddit dataset, the reverse edge is e ± |E|/2.
reverse_eids=th.cat([
th.arange(n_edges // 2, n_edges),
th.arange(0, n_edges // 2)]).to(train_seeds),
negative_sampler=NegativeSampler(g, args.num_negs, args.neg_share,
device if args.graph_device == 'uva' else None),
device if args.graph_device == 'uva' else None))
dataloader = dgl.dataloading.EdgeDataLoader(
g, train_seeds, sampler,
device=device,
use_ddp=n_gpus > 1,
batch_size=args.batch_size,
shuffle=True,
drop_last=False,
num_workers=args.num_workers)
num_workers=args.num_workers,
use_uva=args.graph_device == 'uva')
# Define model and optimizer
model = SAGE(in_feats, args.num_hidden, args.num_hidden, args.num_layers, F.relu, args.dropout)
@@ -157,18 +157,19 @@ def run(proc_id, n_gpus, args, devices, data):
proc_id, epoch, step, loss.item(), np.mean(iter_pos[3:]), np.mean(iter_neg[3:]), np.mean(iter_d[3:]), np.mean(iter_t[3:]), gpu_mem_alloc))
tic_step = time.time()
if step % args.eval_every == 0 and proc_id == 0:
eval_acc, test_acc = evaluate(model, g, nfeat, labels, train_nid, val_nid, test_nid, device)
toc = time.time()
if proc_id == 0:
print('Epoch Time(s): {:.4f}'.format(toc - tic))
if epoch >= 5:
avg += toc - tic
if (epoch + 1) % args.eval_every == 0:
eval_acc, test_acc = evaluate(model, g, nfeat, labels, train_nid, val_nid, test_nid, device, args)
print('Eval Acc {:.4f} Test Acc {:.4f}'.format(eval_acc, test_acc))
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
best_test_acc = test_acc
print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format(best_eval_acc, best_test_acc))
toc = time.time()
if proc_id == 0:
print('Epoch Time(s): {:.4f}'.format(toc - tic))
if epoch >= 5:
avg += toc - tic
if n_gpus > 1:
th.distributed.barrier()
@@ -201,15 +202,7 @@ def main(args):
# this to avoid competition overhead on machines with many cores.
# Change it to a proper number on your machine, especially for multi-GPU training.
os.environ['OMP_NUM_THREADS'] = str(mp.cpu_count() // 2 // n_gpus)
if n_gpus > 1:
# Copy the graph to shared memory explicitly before pinning.
# In other cases, we can just rely on fork's copy-on-write.
# TODO: the original graph g is not freed.
if args.graph_device == 'uva':
g = g.shared_memory('g')
if args.data_device == 'uva':
nfeat = nfeat.share_memory_()
labels = labels.share_memory_()
# Pack data
data = train_nid, val_nid, test_nid, n_classes, g, nfeat, labels
@@ -222,13 +215,7 @@ def main(args):
elif n_gpus == 1:
run(0, n_gpus, args, devices, data)
else:
procs = []
for proc_id in range(n_gpus):
p = mp.Process(target=run, args=(proc_id, n_gpus, args, devices, data))
p.start()
procs.append(p)
for p in procs:
p.join()
mp.spawn(run, args=(n_gpus, args, devices, data), nprocs=n_gpus)
if __name__ == '__main__':
@@ -247,7 +234,7 @@ if __name__ == '__main__':
argparser.add_argument('--fan-out', type=str, default='10,25')
argparser.add_argument('--batch-size', type=int, default=10000)
argparser.add_argument('--log-every', type=int, default=20)
argparser.add_argument('--eval-every', type=int, default=1000)
argparser.add_argument('--eval-every', type=int, default=5)
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
......
## DistGNN vertex-cut based graph partitioning (using Libra)
### How to run graph partitioning
```python ../../../../python/dgl/distgnn/partition/main_Libra.py <dataset> <#partitions>```
Example: the following command line creates 4 partitions of the pubmed graph:
```python ../../../../python/dgl/distgnn/partition/main_Libra.py pubmed 4```
The output partitions are created in the Libra_result_\<dataset\>/ folder in the current directory.
The *upcoming DistGNN* application can directly use these partitions for distributed training.
### How Libra partitioning works
Libra is a vertex-cut based graph partitioning method. It applies greedy heuristics to distribute the input graph's edges uniquely among the partitions, and it outputs each partition as a list of edges. After obtaining the Libra partitions, the script ```main_Libra.py``` converts the Libra output to the DGL/DistGNN input format.
Note: the current Libra implementation is sequential, and extra overhead is paid for the additional work of converting the partitioned graph to this format.
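The sketch below shows the general idea of greedy vertex-cut edge partitioning (an illustrative simplification, not the actual Libra implementation): each edge goes to exactly one partition, preferring partitions that already hold one of its endpoints and breaking ties by load, so edges are unique while vertices may be replicated.
```python
from collections import defaultdict

def greedy_vertex_cut(edges, num_parts):
    part_edges = [[] for _ in range(num_parts)]
    vertex_parts = defaultdict(set)   # vertex -> partitions holding a replica
    for u, v in edges:
        # prefer partitions that already contain u or v to limit replication
        candidates = vertex_parts[u] | vertex_parts[v]
        pool = candidates if candidates else range(num_parts)
        p = min(pool, key=lambda i: len(part_edges[i]))   # least-loaded choice
        part_edges[p].append((u, v))
        vertex_parts[u].add(p)
        vertex_parts[v].add(p)
    return part_edges
```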
### Expected partitioning timings
* Cora, Pubmed, Citeseer: < 10 sec (< 10 GB)
* Reddit: 1500 sec (~25 GB)
* OGBN-Products: ~2000 sec (~30 GB)
* Proteins: 18000 sec (~100 GB; format conversion from the public data takes time)
* OGBN-Papers100M: 25000 sec (~200 GB)
### Settings
Tested with:
* CentOS 7.6
* gcc v8.3.0
* PyTorch 1.7.1
* Python 3.7.10
## Distributed training
This is an example of training GraphSAGE in a distributed fashion. Before training, please install the required Python libraries with pip:
......