Unverified Commit 5d8330cc authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[Test] Add model speed and accuracy tests for RGCN (#2458)

* add rgcn acc bench

* fix issue of multiple parametrize

* model acc/speed test: RGCN

* fix dep problem in docker run

* add docstring
parent f8b3ebce
......@@ -27,7 +27,7 @@
],
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["master", "0.5.0", "0.5.2", "0.5.3", "0.4.3.post2"], // for git
"branches": ["HEAD"], // for git
// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
......
import dgl
from dgl.nn.pytorch import RelGraphConv
import torch
import torch.nn as nn
import torch.nn.functional as F
from .. import utils
class RGCN(nn.Module):
    """R-GCN entity classifier built from basis-decomposition ``RelGraphConv`` layers.

    Layer stack: one input-to-hidden layer, ``num_hidden_layers`` hidden-to-hidden
    layers, and one hidden-to-output layer. All layers use the "basis"
    regularizer and low-memory message passing; only the output layer has no
    activation (it produces raw logits).
    """

    def __init__(self,
                 num_nodes,
                 n_hidden,
                 num_classes,
                 num_rels,
                 num_bases,
                 num_hidden_layers,
                 dropout):
        super(RGCN, self).__init__()
        convs = nn.ModuleList()
        # Input layer: nodes are featureless, so the input width is the node
        # count (features are node ids embedded by the layer).
        convs.append(RelGraphConv(num_nodes, n_hidden, num_rels, "basis",
                                  num_bases, activation=F.relu,
                                  dropout=dropout, low_mem=True))
        # Hidden layers (may be zero of them).
        for _ in range(num_hidden_layers):
            convs.append(RelGraphConv(n_hidden, n_hidden, num_rels, "basis",
                                      num_bases, activation=F.relu,
                                      dropout=dropout, low_mem=True))
        # Output layer: raw class logits, no activation.
        convs.append(RelGraphConv(n_hidden, num_classes, num_rels, "basis",
                                  num_bases, activation=None, low_mem=True))
        self.layers = convs

    def forward(self, g, h, r, norm):
        """Apply every layer in order; ``r`` are edge types, ``norm`` edge norms."""
        out = h
        for conv in self.layers:
            out = conv(g, out, r, norm)
        return out
def evaluate(model, g, feats, edge_type, edge_norm, labels, idx):
    """Return the classification accuracy (0-100) of ``model`` on nodes ``idx``.

    ``labels`` must already be restricted to the same nodes as ``idx``.
    Runs in eval mode without building the autograd graph.
    """
    model.eval()
    with torch.no_grad():
        scores = model(g, feats, edge_type, edge_norm)[idx]
        predictions = scores.argmax(dim=1)
        num_correct = (predictions == labels).sum().item()
    return 100.0 * num_correct / len(labels)
@utils.benchmark('acc')
@utils.parametrize('data', ['aifb', 'mutag'])
def track_acc(data):
    """Train a 2-layer R-GCN for RDF entity classification; return test accuracy.

    Benchmarked on the AIFB and MUTAG datasets. The returned value is a
    percentage in [0, 100], as required by ``utils.benchmark('acc')``.
    """
    # args: per-dataset hyper-parameters — number of bases for the basis
    # decomposition (-1 presumably means "use num_rels bases" inside
    # RelGraphConv — TODO confirm) and the L2 weight-decay coefficient.
    if data == 'aifb':
        num_bases = -1
        l2norm = 0.
    elif data == 'mutag':
        num_bases = 30
        l2norm = 5e-4
    elif data == 'am':
        # NOTE(review): 'am' is not in the parametrize list above, so this
        # branch is currently unreachable from the benchmark harness.
        num_bases = 40
        l2norm = 5e-4
    else:
        raise ValueError()

    data = utils.process_data(data)
    device = utils.get_bench_device()

    g = data[0]
    num_rels = len(g.canonical_etypes)
    category = data.predict_category
    num_classes = data.num_classes
    # pop() removes masks/labels from the heterograph so they are not dragged
    # along into the homogeneous graph built below.
    train_mask = g.nodes[category].data.pop('train_mask').bool().to(device)
    test_mask = g.nodes[category].data.pop('test_mask').bool().to(device)
    labels = g.nodes[category].data.pop('labels').to(device)

    # calculate norm for each edge type and store in edge
    for canonical_etype in g.canonical_etypes:
        u, v, eid = g.all_edges(form='all', etype=canonical_etype)
        # Per-edge norm = 1 / in-degree of the destination node (within this
        # edge type); unique+inverse maps each edge's dest to its count.
        _, inverse_index, count = torch.unique(v, return_inverse=True, return_counts=True)
        degrees = count[inverse_index]
        norm = 1. / degrees.float()
        norm = norm.unsqueeze(1)
        g.edges[canonical_etype].data['norm'] = norm

    # get target category id (index of the predicted node type in g.ntypes)
    category_id = len(g.ntypes)
    for i, ntype in enumerate(g.ntypes):
        if ntype == category:
            category_id = i

    # Flatten to a homogeneous graph; node/edge type ids survive in
    # ndata[dgl.NTYPE] / edata[dgl.ETYPE], and 'norm' is carried over.
    g = dgl.to_homogeneous(g, edata=['norm']).to(device)
    num_nodes = g.number_of_nodes()
    edge_norm = g.edata['norm']
    edge_type = g.edata[dgl.ETYPE].long()

    # find out the target node ids in g
    target_idx = torch.where(g.ndata[dgl.NTYPE] == category_id)[0]
    train_idx = target_idx[train_mask]
    test_idx = target_idx[test_mask]
    train_labels = labels[train_mask]
    test_labels = labels[test_mask]

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes, device=device)

    # create model: input + output RelGraphConv only (0 extra hidden layers),
    # hidden size 16, no dropout.
    model = RGCN(num_nodes,
                 16,
                 num_classes,
                 num_rels,
                 num_bases,
                 0,
                 0).to(device)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-2,
                                 weight_decay=l2norm)

    # Full-graph training for a fixed 30 epochs.
    model.train()
    for epoch in range(30):
        logits = model(g, feats, edge_type, edge_norm)
        loss = F.cross_entropy(logits[train_idx], train_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    acc = evaluate(model, g, feats, edge_type, edge_norm, test_labels, test_idx)
    return acc
......@@ -98,4 +98,4 @@ def track_time(data):
optimizer.step()
t1 = time.time()
return t1 - t0
return (t1 - t0) / num_epochs
import time
import dgl
from dgl.nn.pytorch import RelGraphConv
import torch
import torch.nn as nn
import torch.nn.functional as F
from .. import utils
class RGCN(nn.Module):
    """R-GCN entity classifier: a stack of basis-decomposition ``RelGraphConv``
    layers (input-to-hidden, optional hidden-to-hidden, hidden-to-output).

    Parameters
    ----------
    num_nodes : int
        Number of nodes; also the input feature width (nodes are featureless,
        so node ids serve as input features).
    n_hidden : int
        Hidden layer width.
    num_classes : int
        Number of output classes (output layer width).
    num_rels : int
        Number of relation (edge) types.
    num_bases : int
        Number of bases for the "basis" weight regularizer.
    num_hidden_layers : int
        Number of extra hidden-to-hidden layers (may be 0).
    dropout : float
        Dropout rate applied in the input and hidden layers.
    """
    def __init__(self,
                 num_nodes,
                 n_hidden,
                 num_classes,
                 num_rels,
                 num_bases,
                 num_hidden_layers,
                 dropout):
        super(RGCN, self).__init__()
        self.layers = nn.ModuleList()
        # i2h
        self.layers.append(RelGraphConv(num_nodes, n_hidden, num_rels, "basis",
                                        num_bases, activation=F.relu, dropout=dropout,
                                        low_mem=True))
        # h2h
        for i in range(num_hidden_layers):
            self.layers.append(RelGraphConv(n_hidden, n_hidden, num_rels, "basis",
                                            num_bases, activation=F.relu, dropout=dropout,
                                            low_mem=True))
        # o2h: output layer emits raw logits (no activation, no dropout)
        self.layers.append(RelGraphConv(n_hidden, num_classes, num_rels, "basis",
                                        num_bases, activation=None, low_mem=True))

    def forward(self, g, h, r, norm):
        """Apply each layer in sequence; ``r`` are edge types, ``norm`` edge norms."""
        for layer in self.layers:
            h = layer(g, h, r, norm)
        return h
@utils.benchmark('time', 3600)
@utils.parametrize('data', ['aifb', 'am'])
def track_time(data):
    """Measure average per-epoch training time of a 2-layer R-GCN.

    Trains full-graph for ``num_epochs`` epochs on AIFB or AM and returns the
    mean wall-clock seconds per epoch (unit required by
    ``utils.benchmark('time')``).
    """
    # args: per-dataset hyper-parameters (basis count and L2 weight decay).
    if data == 'aifb':
        num_bases = -1
        l2norm = 0.
    elif data == 'am':
        num_bases = 40
        l2norm = 5e-4
    else:
        raise ValueError()

    data = utils.process_data(data)
    device = utils.get_bench_device()
    num_epochs = 30

    g = data[0]
    num_rels = len(g.canonical_etypes)
    category = data.predict_category
    num_classes = data.num_classes
    # pop() strips masks/labels off the heterograph before homogenization.
    train_mask = g.nodes[category].data.pop('train_mask').bool().to(device)
    test_mask = g.nodes[category].data.pop('test_mask').bool().to(device)
    labels = g.nodes[category].data.pop('labels').to(device)

    # calculate norm for each edge type and store in edge
    for canonical_etype in g.canonical_etypes:
        u, v, eid = g.all_edges(form='all', etype=canonical_etype)
        # Per-edge norm = 1 / in-degree of the destination (per edge type).
        _, inverse_index, count = torch.unique(v, return_inverse=True, return_counts=True)
        degrees = count[inverse_index]
        norm = 1. / degrees.float()
        norm = norm.unsqueeze(1)
        g.edges[canonical_etype].data['norm'] = norm

    # get target category id (index of the predicted node type in g.ntypes)
    category_id = len(g.ntypes)
    for i, ntype in enumerate(g.ntypes):
        if ntype == category:
            category_id = i

    # Flatten to a homogeneous graph; types survive as dgl.NTYPE / dgl.ETYPE.
    g = dgl.to_homogeneous(g, edata=['norm']).to(device)
    num_nodes = g.number_of_nodes()
    edge_norm = g.edata['norm']
    edge_type = g.edata[dgl.ETYPE].long()

    # find out the target node ids in g
    target_idx = torch.where(g.ndata[dgl.NTYPE] == category_id)[0]
    train_idx = target_idx[train_mask]
    test_idx = target_idx[test_mask]
    train_labels = labels[train_mask]
    test_labels = labels[test_mask]

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes, device=device)

    # create model: input + output layer only, hidden size 16, no dropout.
    model = RGCN(num_nodes,
                 16,
                 num_classes,
                 num_rels,
                 num_bases,
                 0,
                 0).to(device)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-2,
                                 weight_decay=l2norm)

    # Time the whole training loop and average over epochs.
    model.train()
    t0 = time.time()
    for epoch in range(num_epochs):
        logits = model(g, feats, edge_type, edge_norm)
        loss = F.cross_entropy(logits[train_idx], train_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    t1 = time.time()

    return (t1 - t0) / num_epochs
......@@ -89,4 +89,4 @@ def track_time(data):
optimizer.step()
t1 = time.time()
return t1 - t0
return (t1 - t0) / num_epochs
......@@ -130,4 +130,4 @@ def track_time(data):
t1 = time.time()
return t1 - t0
return (t1 - t0) / num_epochs
......@@ -180,4 +180,4 @@ def track_time(data):
t1 = time.time()
return t1 - t0
return (t1 - t0) / num_epochs
import os
import shutil, zipfile
import requests
import inspect
import numpy as np
import pandas
import dgl
......@@ -37,7 +38,7 @@ def get_graph(name):
print(name + " doesn't exist")
return None
class ogb_data(object):
class OGBDataset(object):
def __init__(self, g, num_labels):
self._g = g
self._num_labels = num_labels
......@@ -81,13 +82,21 @@ def load_ogb_product(name):
graph.ndata['val_mask'] = val_mask
graph.ndata['test_mask'] = test_mask
return ogb_data(graph, num_labels)
return OGBDataset(graph, num_labels)
def process_data(name):
if name == 'cora':
return dgl.data.CoraGraphDataset()
elif name == 'pubmed':
return dgl.data.PubmedGraphDataset()
elif name == 'aifb':
return dgl.data.AIFBDataset()
elif name == 'mutag':
return dgl.data.MUTAGDataset()
elif name == 'bgs':
return dgl.data.BGSDataset()
elif name == 'am':
return dgl.data.AMDataset()
elif name == 'reddit':
return dgl.data.RedditDataset(self_loop=True)
elif name == 'ogbn-products':
......@@ -119,17 +128,88 @@ TRACK_SETUP = {
}
def parametrize(param_name, params):
    """Decorator for benchmarking over a set of parameters.

    Parameters
    ----------
    param_name : str
        Parameter name. Must be one of the arguments of the decorated function.
    params : list[any]
        List of values to benchmark for the given parameter name. Recommend
        to use Python's native object type (e.g., int, str, list[int]) because
        ASV will display them on the plot.

    Examples
    --------
    Benchmark function `foo` when argument `x` is equal to 10 or 20.

    .. code::

        @benchmark('time')
        @parametrize('x', [10, 20])
        def foo(x):
            pass

    Benchmark function with multiple parametrizations. It will run the function
    with all possible combinations. The example below generates 6 benchmarks.

    .. code::

        @benchmark('time')
        @parametrize('x', [10, 20])
        @parametrize('y', [-1, -2, -3])
        def foo(x, y):
            pass

    When using multiple parametrizations, it can have arbitrary order. The example
    below is the same as the above one.

    .. code::

        @benchmark('time')
        @parametrize('y', [-1, -2, -3])
        @parametrize('x', [10, 20])
        def foo(x, y):
            pass
    """
    def _wrapper(func):
        sig_params = inspect.signature(func).parameters.keys()
        num_params = len(sig_params)
        # Allocate one slot per function argument so that values are stored
        # POSITIONALLY (by argument index), not in decorator-application
        # order. This is what makes multiple @parametrize decorators
        # order-independent: appending instead would pair ASV's params with
        # the wrong argument when decorators are stacked out of order.
        if getattr(func, 'params', None) is None:
            func.params = [None] * num_params
        if getattr(func, 'param_names', None) is None:
            func.param_names = [None] * num_params
        found_param = False
        for i, sig_param in enumerate(sig_params):
            if sig_param == param_name:
                func.params[i] = params
                func.param_names[i] = param_name
                found_param = True
                break
        if not found_param:
            raise ValueError('Invalid parameter name:', param_name)
        return func
    return _wrapper
def benchmark(track_type, timeout=60):
"""Decorator for indicating the benchmark type.
Parameters
----------
track_type : str
Type. Must be either:
- 'time' : For timing. Unit: second.
- 'acc' : For accuracy. Unit: percentage, value between 0 and 100.
timeout : int
Timeout threshold in second.
Examples
--------
.. code::
@benchmark('time')
def foo():
pass
"""
assert track_type in ['time', 'acc']
def _wrapper(func):
func.unit = TRACK_UNITS[track_type]
......
......@@ -5,7 +5,7 @@ set -e
. /opt/conda/etc/profile.d/conda.sh
pip install -r /asv/torch_gpu_pip.txt
pip install pandas
pip install pandas rdflib
# install
pushd python
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment