Unverified Commit cfe6e70b authored by Quan (Andy) Gan, committed by GitHub

RGAT baseline for OGB-LSC node classification challenge (#2810)

* RGAT baseline

* update links (data.dgl.ai hasn't refreshed yet)

* nvm still use dgl.ai domain

* descriptions

* actually it took less

* address comments

* stop worrying about cache

* update links

* oops
parent 05c53ca3
# Baseline Code for MAG240M
The code is ported from the R-GAT examples [here](https://github.com/snap-stanford/ogb/tree/master/examples/lsc/mag240m). Please refer to the [OGB-LSC paper](https://arxiv.org/abs/2103.09430) for the detailed setting.
## Installation Requirements
```
ogb>=1.3.0
torch>=1.7.0
```
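A quick way to verify the installed versions (a minimal check, assuming both packages import cleanly):
```
import ogb
import torch
print(ogb.__version__)    # expect >= 1.3.0
print(torch.__version__)  # expect >= 1.7.0
```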
## Running Preprocessing Script
```
python preprocess.py \
--rootdir . \
--author-output-path ./author.npy \
--inst-output-path ./inst.npy \
--graph-output-path ./graph.dgl \
--graph-as-homogeneous \
--full-output-path ./full.npy
```
This will give you the following files:
* `author.npy`: The author features, preprocessed by averaging the neighboring paper features.
* `inst.npy`: The institution features, preprocessed by averaging the neighboring author features.
* `graph.dgl`: The *homogenized* DGL graph stored in CSC format, which is friendly for neighbor sampling.
Edge types are stored on the edges as a `uint8` feature. Nodes are in the order of author, institution,
and paper.
* `full.npy`: The concatenated author, institution, and paper features.
Since preprocessing usually takes a long time, we also offer the files above for download:
* [`author.npy`](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/author.npy)
* [`inst.npy`](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/inst.npy)
* [`graph.dgl`](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/graph.dgl)
* [`full.npy`](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/full.npy)
In addition, we offer
* [`full_feat.npy`](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/full_feat.npy): The preprocessed full feature matrix
for running OGB's own baseline. Note that the features are concatenated in the order of paper, author, and
institution, unlike the ordering used in our baseline code. It is also preprocessed with float32 arithmetic instead
of float16 arithmetic.
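As a quick sanity check after downloading (or generating) these files, they can be opened as sketched below. Note that the feature files are raw `np.memmap` dumps without a `.npy` header, so they must be opened with `np.memmap` and an explicit shape, exactly as the training script does; the paths assume the files sit in the current directory.
```
import numpy as np
import dgl
from ogb.lsc import MAG240MDataset

dataset = MAG240MDataset(root='.')
num_nodes = dataset.num_authors + dataset.num_institutions + dataset.num_papers

# Features of all nodes in author/institution/paper order, stored as float16.
full = np.memmap('./full.npy', mode='r', dtype='float16',
                 shape=(num_nodes, dataset.num_paper_features))

# Homogenized graph; edge types live in g.edata['etype'].
(g,), _ = dgl.load_graphs('./graph.dgl')
print(g)
print(full.shape)
```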
## Running Training Script
```
python train.py \
--rootdir . \
--graph-path ./graph.dgl \
--full-feature-path ./full.npy
```
The validation accuracy is 0.701. We do not have ground-truth test labels, so we do not report
test accuracy.
## Hardware configurations
We successfully ran 8 experiments in parallel on an AWS p4d.24xlarge instance with the preprocessed feature
matrices stored on an NVMe SSD to enable fast disk reads. Each experiment requires less than 128GB of CPU
memory and less than 12GB of GPU memory. Every epoch takes around 6 minutes 30 seconds to train and
1 minute 40 seconds to validate.
If your hard drive is slow, it is best to load all the features into memory to get a reasonable training
speed, although CPU memory consumption can then grow to as much as 512GB.
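A simple way to force the features into RAM (a sketch assuming the `full.npy` produced above; `np.array` makes a real in-memory copy, whereas the memmap itself only maps the file lazily):
```
import numpy as np
from ogb.lsc import MAG240MDataset

dataset = MAG240MDataset(root='.')
num_nodes = dataset.num_authors + dataset.num_institutions + dataset.num_papers

feats = np.memmap('./full.npy', mode='r', dtype='float16',
                  shape=(num_nodes, dataset.num_paper_features))
feats = np.array(feats)  # copies the full feature matrix into memory
```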
import ogb
from ogb.lsc import MAG240MDataset
import tqdm
import numpy as np
import torch
import dgl
import dgl.function as fn
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument('--rootdir', type=str, default='.', help='Directory to download the OGB dataset.')
parser.add_argument('--author-output-path', type=str, help='Path to store the author features.')
parser.add_argument('--inst-output-path', type=str,
help='Path to store the institution features.')
parser.add_argument('--graph-output-path', type=str, help='Path to store the graph.')
parser.add_argument('--graph-format', type=str, default='csc', help='Graph format (coo, csr or csc).')
parser.add_argument('--graph-as-homogeneous', action='store_true', help='Store the graph as DGL homogeneous graph.')
parser.add_argument('--full-output-path', type=str,
help='Path to store features of all nodes. Effective only when graph is homogeneous.')
args = parser.parse_args()
print('Building graph')
dataset = MAG240MDataset(root=args.rootdir)
ei_writes = dataset.edge_index('author', 'writes', 'paper')
ei_cites = dataset.edge_index('paper', 'paper')
ei_affiliated = dataset.edge_index('author', 'institution')
# We sort the nodes starting with the authors, then the institutions, then the papers.
author_offset = 0
inst_offset = author_offset + dataset.num_authors
paper_offset = inst_offset + dataset.num_institutions
g = dgl.heterograph({
('author', 'write', 'paper'): (ei_writes[0], ei_writes[1]),
('paper', 'write-by', 'author'): (ei_writes[1], ei_writes[0]),
('author', 'affiliate-with', 'institution'): (ei_affiliated[0], ei_affiliated[1]),
('institution', 'affiliate', 'author'): (ei_affiliated[1], ei_affiliated[0]),
('paper', 'cite', 'paper'): (np.concatenate([ei_cites[0], ei_cites[1]]), np.concatenate([ei_cites[1], ei_cites[0]]))
})
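# Both directions of every relation are kept: 'write-by' (paper -> author) and
# 'affiliate-with' (author -> institution) drive the feature averaging below, and
# citation edges are made symmetric by concatenating both directions.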
paper_feat = dataset.paper_feat
author_feat = np.memmap(args.author_output_path, mode='w+', dtype='float16', shape=(dataset.num_authors, dataset.num_paper_features))
inst_feat = np.memmap(args.inst_output_path, mode='w+', dtype='float16', shape=(dataset.num_institutions, dataset.num_paper_features))
# Process author and institution features block by block along the feature dimension.
BLOCK_COLS = 16
with tqdm.trange(0, dataset.num_paper_features, BLOCK_COLS) as tq:
for start in tq:
tq.set_postfix_str('Reading paper features...')
g.nodes['paper'].data['x'] = torch.FloatTensor(paper_feat[:, start:start + BLOCK_COLS].astype('float32'))
# Compute author features...
tq.set_postfix_str('Computing author features...')
g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='write-by')
# Then institution features...
tq.set_postfix_str('Computing institution features...')
g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='affiliate-with')
tq.set_postfix_str('Writing author features...')
author_feat[:, start:start + BLOCK_COLS] = g.nodes['author'].data['x'].numpy().astype('float16')
tq.set_postfix_str('Writing institution features...')
inst_feat[:, start:start + BLOCK_COLS] = g.nodes['institution'].data['x'].numpy().astype('float16')
del g.nodes['paper'].data['x']
del g.nodes['author'].data['x']
del g.nodes['institution'].data['x']
author_feat.flush()
inst_feat.flush()
# Convert to homogeneous if needed. (The RGAT baseline needs a homogeneous graph.)
if args.graph_as_homogeneous:
# Process graph
g = dgl.to_homogeneous(g)
# DGL ensures that nodes with the same type are put together with the order preserved.
# DGL also ensures that the node types are sorted in ascending order.
assert torch.equal(
g.ndata[dgl.NTYPE],
torch.cat([torch.full((dataset.num_authors,), 0),
torch.full((dataset.num_institutions,), 1),
torch.full((dataset.num_papers,), 2)]))
assert torch.equal(
g.ndata[dgl.NID],
torch.cat([torch.arange(dataset.num_authors),
torch.arange(dataset.num_institutions),
torch.arange(dataset.num_papers)]))
g.edata['etype'] = g.edata[dgl.ETYPE].byte()
del g.edata[dgl.ETYPE]
del g.ndata[dgl.NTYPE]
del g.ndata[dgl.NID]
# Process feature
full_feat = np.memmap(
args.full_output_path, mode='w+', dtype='float16',
shape=(dataset.num_authors + dataset.num_institutions + dataset.num_papers, dataset.num_paper_features))
BLOCK_ROWS = 100000
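# Copy the author, institution and paper features into the full matrix in row blocks,
# following the offsets computed above, to keep peak memory usage low.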
for start in tqdm.trange(0, dataset.num_authors, BLOCK_ROWS):
end = min(dataset.num_authors, start + BLOCK_ROWS)
full_feat[author_offset + start:author_offset + end] = author_feat[start:end]
for start in tqdm.trange(0, dataset.num_institutions, BLOCK_ROWS):
end = min(dataset.num_institutions, start + BLOCK_ROWS)
full_feat[inst_offset + start:inst_offset + end] = inst_feat[start:end]
for start in tqdm.trange(0, dataset.num_papers, BLOCK_ROWS):
end = min(dataset.num_papers, start + BLOCK_ROWS)
full_feat[paper_offset + start:paper_offset + end] = paper_feat[start:end]
# Convert the graph to the given format and save. (The RGAT baseline needs a CSC graph.)
g = g.formats(args.graph_format)
dgl.save_graphs(args.graph_output_path, g)
#!/usr/bin/env python
# coding: utf-8
import ogb
from ogb.lsc import MAG240MDataset, MAG240MEvaluator
import dgl
import torch
import numpy as np
import time
import tqdm
import dgl.function as fn
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F
import argparse
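# RGAT: each layer applies one GATConv per edge type to the corresponding edge subgraph
# of the sampled block, sums the per-type outputs with a linear skip connection, then
# applies batch norm, ELU and dropout. A final MLP maps to the class logits.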
class RGAT(nn.Module):
def __init__(self, in_channels, out_channels, hidden_channels, num_etypes, num_layers, num_heads, dropout, pred_ntype):
super().__init__()
self.convs = nn.ModuleList()
self.norms = nn.ModuleList()
self.skips = nn.ModuleList()
self.convs.append(nn.ModuleList([
dglnn.GATConv(in_channels, hidden_channels // num_heads, num_heads, allow_zero_in_degree=True)
for _ in range(num_etypes)
]))
self.norms.append(nn.BatchNorm1d(hidden_channels))
self.skips.append(nn.Linear(in_channels, hidden_channels))
for _ in range(num_layers - 1):
self.convs.append(nn.ModuleList([
dglnn.GATConv(hidden_channels, hidden_channels // num_heads, num_heads, allow_zero_in_degree=True)
for _ in range(num_etypes)
]))
self.norms.append(nn.BatchNorm1d(hidden_channels))
self.skips.append(nn.Linear(hidden_channels, hidden_channels))
self.mlp = nn.Sequential(
nn.Linear(hidden_channels, hidden_channels),
nn.BatchNorm1d(hidden_channels),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_channels, out_channels)
)
self.dropout = nn.Dropout(dropout)
self.hidden_channels = hidden_channels
self.pred_ntype = pred_ntype
self.num_etypes = num_etypes
def forward(self, mfgs, x):
for i in range(len(mfgs)):
mfg = mfgs[i]
x_dst = x[:mfg.num_dst_nodes()]
n_src = mfg.num_src_nodes()
n_dst = mfg.num_dst_nodes()
mfg = dgl.block_to_graph(mfg)
x_skip = self.skips[i](x_dst)
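# Run one GAT convolution per edge type on the corresponding edge subgraph and
# accumulate the per-type outputs on top of the skip connection.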
for j in range(self.num_etypes):
subg = mfg.edge_subgraph(mfg.edata['etype'] == j, preserve_nodes=True)
x_skip += self.convs[i][j](subg, (x, x_dst)).view(-1, self.hidden_channels)
x = self.norms[i](x_skip)
x = F.elu(x)
x = self.dropout(x)
return self.mlp(x)
class ExternalNodeCollator(dgl.dataloading.NodeCollator):
def __init__(self, g, idx, sampler, offset, feats, label):
super().__init__(g, idx, sampler)
self.offset = offset
self.feats = feats
self.label = label
def collate(self, items):
input_nodes, output_nodes, mfgs = super().collate(items)
# Copy input features
mfgs[0].srcdata['x'] = torch.FloatTensor(self.feats[input_nodes])
mfgs[-1].dstdata['y'] = torch.LongTensor(self.label[output_nodes - self.offset])
return input_nodes, output_nodes, mfgs
def train(args, dataset, g, feats, paper_offset):
print('Loading masks and labels')
train_idx = torch.LongTensor(dataset.get_idx_split('train')) + paper_offset
valid_idx = torch.LongTensor(dataset.get_idx_split('valid')) + paper_offset
label = dataset.paper_label
print('Initializing dataloader...')
sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 25])
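# Two-layer neighbor sampling with per-layer fan-outs of 15 and 25 neighbors.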
train_collator = ExternalNodeCollator(g, train_idx, sampler, paper_offset, feats, label)
valid_collator = ExternalNodeCollator(g, valid_idx, sampler, paper_offset, feats, label)
train_dataloader = torch.utils.data.DataLoader(
train_collator.dataset,
batch_size=1024,
shuffle=True,
drop_last=False,
collate_fn=train_collator.collate,
num_workers=4
)
valid_dataloader = torch.utils.data.DataLoader(
valid_collator.dataset,
batch_size=1024,
shuffle=True,
drop_last=False,
collate_fn=valid_collator.collate,
num_workers=2
)
print('Initializing model...')
model = RGAT(dataset.num_paper_features, dataset.num_classes, 1024, 5, 2, 4, 0.5, 'paper').cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.001)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=25, gamma=0.25)
best_acc = 0
for _ in range(args.epochs):
model.train()
with tqdm.tqdm(train_dataloader) as tq:
for i, (input_nodes, output_nodes, mfgs) in enumerate(tq):
mfgs = [g.to('cuda') for g in mfgs]
x = mfgs[0].srcdata['x']
y = mfgs[-1].dstdata['y']
y_hat = model(mfgs, x)
loss = F.cross_entropy(y_hat, y)
opt.zero_grad()
loss.backward()
opt.step()
acc = (y_hat.argmax(1) == y).float().mean()
tq.set_postfix({'loss': '%.4f' % loss.item(), 'acc': '%.4f' % acc.item()}, refresh=False)
model.eval()
correct = total = 0
for i, (input_nodes, output_nodes, mfgs) in enumerate(tqdm.tqdm(valid_dataloader)):
with torch.no_grad():
mfgs = [g.to('cuda') for g in mfgs]
x = mfgs[0].srcdata['x']
y = mfgs[-1].dstdata['y']
y_hat = model(mfgs, x)
correct += (y_hat.argmax(1) == y).sum().item()
total += y_hat.shape[0]
acc = correct / total
print('Validation accuracy:', acc)
sched.step()
if best_acc < acc:
best_acc = acc
print('Updating best model...')
torch.save(model.state_dict(), args.model_path)
def test(args, dataset, g, feats, paper_offset):
print('Loading masks and labels...')
valid_idx = torch.LongTensor(dataset.get_idx_split('valid')) + paper_offset
test_idx = torch.LongTensor(dataset.get_idx_split('test')) + paper_offset
label = dataset.paper_label
print('Initializing data loader...')
sampler = dgl.dataloading.MultiLayerNeighborSampler([160, 160])
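# Larger fan-outs (160 per layer) give a closer approximation to full-neighborhood
# aggregation at evaluation time, which is why the batch size below is small.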
valid_collator = ExternalNodeCollator(g, valid_idx, sampler, paper_offset, feats, label)
valid_dataloader = torch.utils.data.DataLoader(
valid_collator.dataset,
batch_size=16,
shuffle=False,
drop_last=False,
collate_fn=valid_collator.collate,
num_workers=2
)
test_collator = ExternalNodeCollator(g, test_idx, sampler, paper_offset, feats, label)
test_dataloader = torch.utils.data.DataLoader(
test_collator.dataset,
batch_size=16,
shuffle=False,
drop_last=False,
collate_fn=test_collator.collate,
num_workers=4
)
print('Loading model...')
model = RGAT(dataset.num_paper_features, dataset.num_classes, 1024, 5, 2, 4, 0.5, 'paper').cuda()
model.load_state_dict(torch.load(args.model_path))
model.eval()
correct = total = 0
for i, (input_nodes, output_nodes, mfgs) in enumerate(tqdm.tqdm(valid_dataloader)):
with torch.no_grad():
mfgs = [g.to('cuda') for g in mfgs]
x = mfgs[0].srcdata['x']
y = mfgs[-1].dstdata['y']
y_hat = model(mfgs, x)
correct += (y_hat.argmax(1) == y).sum().item()
total += y_hat.shape[0]
acc = correct / total
print('Validation accuracy:', acc)
evaluator = MAG240MEvaluator()
y_preds = []
for i, (input_nodes, output_nodes, mfgs) in enumerate(tqdm.tqdm(test_dataloader)):
with torch.no_grad():
mfgs = [g.to('cuda') for g in mfgs]
x = mfgs[0].srcdata['x']
y = mfgs[-1].dstdata['y']
y_hat = model(mfgs, x)
y_preds.append(y_hat.argmax(1).cpu())
evaluator.save_test_submission({'y_pred': torch.cat(y_preds)}, args.submission_path)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--rootdir', type=str, default='.', help='Directory to download the OGB dataset.')
parser.add_argument('--graph-path', type=str, default='./graph.dgl', help='Path to the graph.')
parser.add_argument('--full-feature-path', type=str, default='./full.npy',
help='Path to the features of all nodes.')
parser.add_argument('--epochs', type=int, default=100, help='Number of epochs.')
parser.add_argument('--model-path', type=str, default='./model.pt', help='Path to store the best model.')
parser.add_argument('--submission-path', type=str, default='./results', help='Submission directory.')
args = parser.parse_args()
dataset = MAG240MDataset(root=args.rootdir)
print('Loading graph')
(g,), _ = dgl.load_graphs(args.graph_path)
g = g.formats(['csc'])
print('Loading features')
paper_offset = dataset.num_authors + dataset.num_institutions
num_nodes = paper_offset + dataset.num_papers
num_features = dataset.num_paper_features
feats = np.memmap(args.full_feature_path, mode='r', dtype='float16', shape=(num_nodes, num_features))
if args.epochs != 0:
train(args, dataset, g, feats, paper_offset)
test(args, dataset, g, feats, paper_offset)
# Baselines for OGB Large-Scale Challenge (LSC) at KDD Cup 2021
We offer alternative download links for OGB datasets:
- [Node Classification with MAG240M](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/mag240m_kddcup2021.zip)
- [Link Prediction with WikiKG90M](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/wikikg90m_kddcup2021.zip)
- [Graph Classification with PCQM4M](https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/pcqm4m_kddcup2021.zip)
@@ -60,6 +60,8 @@ class GATConv(nn.Module):
         causing silent performance regression. This module will raise a DGLError if it detects
         0-in-degree nodes in input graph. By setting ``True``, it will suppress the check
         and let the users handle it by themselves. Defaults: ``False``.
+    bias : bool, optional
+        If True, learns a bias term. Defaults: ``True``.
 
     Note
     ----
@@ -141,7 +143,8 @@ class GATConv(nn.Module):
                  negative_slope=0.2,
                  residual=False,
                  activation=None,
-                 allow_zero_in_degree=False):
+                 allow_zero_in_degree=False,
+                 bias=True):
         super(GATConv, self).__init__()
         self._num_heads = num_heads
         self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
@@ -160,6 +163,10 @@ class GATConv(nn.Module):
         self.feat_drop = nn.Dropout(feat_drop)
         self.attn_drop = nn.Dropout(attn_drop)
         self.leaky_relu = nn.LeakyReLU(negative_slope)
+        if bias:
+            self.bias = nn.Parameter(th.FloatTensor(size=(num_heads * out_feats,)))
+        else:
+            self.register_buffer('bias', None)
         if residual:
             if self._in_dst_feats != out_feats:
                 self.res_fc = nn.Linear(
@@ -191,6 +198,7 @@ class GATConv(nn.Module):
         nn.init.xavier_normal_(self.fc_dst.weight, gain=gain)
         nn.init.xavier_normal_(self.attn_l, gain=gain)
         nn.init.xavier_normal_(self.attn_r, gain=gain)
+        nn.init.constant_(self.bias, 0)
         if isinstance(self.res_fc, nn.Linear):
             nn.init.xavier_normal_(self.res_fc.weight, gain=gain)
@@ -296,8 +304,11 @@ class GATConv(nn.Module):
             rst = graph.dstdata['ft']
             # residual
             if self.res_fc is not None:
-                resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats)
+                resval = self.res_fc(h_dst).view(h_dst.shape[0], self._num_heads, self._out_feats)
                 rst = rst + resval
+            # bias
+            if self.bias is not None:
+                rst = rst + self.bias.view(1, self._num_heads, self._out_feats)
             # activation
             if self.activation:
                 rst = self.activation(rst)
...