Unverified Commit 71283997 authored by Jinjing Zhou, committed by GitHub

Integrate Regression Test with Jenkins (#2448)

* add bench jenkins

* instance type

* fix

* fix

* fix

* 111

* test

* 111

* 111

* fix

* test

* run

* fix

* fix

* fix

* fix

* fix

* publish results

* 111

* regression

* launch ec2 script

* fix

* add

* run on master

* change

* rrr

* run gpu

* fix

* fix

* try fix

* fix

* ff

* fix

* fix

* fix

* refactor

* fix

* fix

* update

* fix

* fix

* fix

* fix

* remove import torchtext

* add shm size

* update

* fix

* fix

* fix

* fix

* fix this!!!!

* 111

* fix

* remove verbose

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* update readme

* fix

* fix

* fix

* change asv default to head

* commit sage and rgcn

* fix

* update
parent 4e7a646b
@@ -17,17 +17,19 @@
     // uninstalling the project. See asv.conf.json documentation.
     //
     "build_command": [
-        "/bin/bash {conf_dir}/build_dgl_asv.sh"
+        "/bin/bash {conf_dir}/scripts/build_dgl_asv.sh"
     ],
     "install_command": [
-        "/bin/bash {conf_dir}/install_dgl_asv.sh"
+        "/bin/bash {conf_dir}/scripts/install_dgl_asv.sh"
     ],
     "uninstall_command": [
        "return-code=any python -m pip uninstall -y dgl"
     ],
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
-    "branches": ["HEAD", "master"],  // for git
+    "branches": [
+        "HEAD"
+    ],  // for git
     // The DVCS being used. If not set, it will be automatically
     // determined from "repo" by looking at the protocol in the URL
     // (if remote), or by looking for special directories, such as
...
import dgl
import itertools
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.nn.pytorch as dglnn
from dgl.nn import RelGraphConv
import time
import tqdm
from .. import utils
class EntityClassify(nn.Module):
""" Entity classification class for RGCN
Parameters
----------
device : int
Device to run the layer.
num_nodes : int
Number of nodes.
h_dim : int
Hidden dim size.
out_dim : int
Output dim size.
num_rels : int
        Number of relation types.
    num_bases : int
        Number of bases. If None, use the number of relations.
    num_hidden_layers : int
        Number of hidden RelGraphConv layers.
    dropout : float
        Dropout rate.
    use_self_loop : bool
        Use self loop if True, default False.
    low_mem : bool
        If True, use the low-memory implementation of the relation message
        passing function, trading speed for lower memory consumption.
"""
def __init__(self,
device,
num_nodes,
h_dim,
out_dim,
num_rels,
num_bases=None,
num_hidden_layers=1,
dropout=0,
use_self_loop=False,
low_mem=False,
layer_norm=False):
super(EntityClassify, self).__init__()
self.device = device
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
        self.num_bases = None if num_bases is None or num_bases < 0 else num_bases
self.num_hidden_layers = num_hidden_layers
self.dropout = dropout
self.use_self_loop = use_self_loop
self.low_mem = low_mem
self.layer_norm = layer_norm
self.layers = nn.ModuleList()
# i2h
self.layers.append(RelGraphConv(
self.h_dim, self.h_dim, self.num_rels, "basis",
self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
low_mem=self.low_mem, dropout=self.dropout, layer_norm = layer_norm))
# h2h
for idx in range(self.num_hidden_layers):
self.layers.append(RelGraphConv(
self.h_dim, self.h_dim, self.num_rels, "basis",
self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
low_mem=self.low_mem, dropout=self.dropout, layer_norm = layer_norm))
# h2o
self.layers.append(RelGraphConv(
self.h_dim, self.out_dim, self.num_rels, "basis",
self.num_bases, activation=None,
self_loop=self.use_self_loop,
low_mem=self.low_mem, layer_norm = layer_norm))
def forward(self, blocks, feats, norm=None):
if blocks is None:
# full graph training
blocks = [self.g] * len(self.layers)
h = feats
for layer, block in zip(self.layers, blocks):
block = block.to(self.device)
h = layer(block, h, block.edata['etype'], block.edata['norm'])
return h
class RelGraphEmbedLayer(nn.Module):
r"""Embedding layer for featureless heterograph.
Parameters
----------
device : int
Device to run the layer.
num_nodes : int
Number of nodes.
    node_tids : tensor
        Stores the node type id for each node, starting from 0.
    num_of_ntype : int
        Number of node types.
    input_size : list of int
        A list of input feature sizes for each node type. If None, the
        corresponding input feature is treated as a one-hot encoding feature.
embed_size : int
Output embed size
embed_name : str, optional
Embed name
"""
def __init__(self,
device,
num_nodes,
node_tids,
num_of_ntype,
input_size,
embed_size,
sparse_emb=False,
embed_name='embed'):
super(RelGraphEmbedLayer, self).__init__()
self.device = device
self.embed_size = embed_size
self.embed_name = embed_name
self.num_nodes = num_nodes
self.sparse_emb = sparse_emb
# create weight embeddings for each node for each relation
self.embeds = nn.ParameterDict()
self.num_of_ntype = num_of_ntype
self.idmap = th.empty(num_nodes).long()
for ntype in range(num_of_ntype):
if input_size[ntype] is not None:
input_emb_size = input_size[ntype].shape[1]
embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
nn.init.xavier_uniform_(embed)
self.embeds[str(ntype)] = embed
self.node_embeds = th.nn.Embedding(node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)
def forward(self, node_ids, node_tids, type_ids, features):
"""Forward computation
Parameters
----------
        node_ids : tensor
            Node ids to generate embeddings for.
        node_tids : tensor
            Node type ids.
        type_ids : tensor
            Per-type node ids (indices into the per-type feature tensors).
        features : list of features
            List of initial features for nodes belonging to different node types.
            If an entry is None, the corresponding feature is a one-hot encoding
            feature; otherwise the features are used directly as input and
            multiplied by a projection matrix.
Returns
-------
tensor
embeddings as the input of the next layer
"""
tsd_ids = node_ids.to(self.node_embeds.weight.device)
embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.device)
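        # For each node type, fill the corresponding rows: project real input
        # features when they exist, otherwise look up the learnable node embedding.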
for ntype in range(self.num_of_ntype):
if features[ntype] is not None:
loc = node_tids == ntype
embeds[loc] = features[ntype][type_ids[loc]].to(self.device) @ self.embeds[str(ntype)].to(self.device)
else:
loc = node_tids == ntype
embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.device)
return embeds
def evaluate(model, embed_layer, eval_loader, node_feats):
model.eval()
embed_layer.eval()
eval_logits = []
eval_seeds = []
with th.no_grad():
for sample_data in tqdm.tqdm(eval_loader):
th.cuda.empty_cache()
seeds, blocks = sample_data
feats = embed_layer(blocks[0].srcdata[dgl.NID],
blocks[0].srcdata[dgl.NTYPE],
blocks[0].srcdata['type_id'],
node_feats)
logits = model(blocks, feats)
eval_logits.append(logits.cpu().detach())
eval_seeds.append(seeds.cpu().detach())
eval_logits = th.cat(eval_logits)
eval_seeds = th.cat(eval_seeds)
return eval_logits, eval_seeds
@utils.benchmark('time', 3600)
@utils.parametrize('data', ['am', 'ogbn-mag'])
def track_acc(data):
dataset = utils.process_data(data)
device = utils.get_bench_device()
if data == 'am':
n_bases = 40
l2norm = 5e-4
elif data == 'ogbn-mag':
n_bases = 2
l2norm = 0
else:
        raise ValueError('Unsupported dataset: {}'.format(data))
fanouts = [25,15]
n_layers = 2
batch_size = 1024
n_hidden = 64
dropout = 0.5
use_self_loop = True
lr = 0.01
n_epochs = 20
low_mem = True
num_workers = 4
hg = dataset[0]
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
test_mask = hg.nodes[category].data.pop('test_mask')
test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
labels = hg.nodes[category].data.pop('labels').to(device)
num_of_ntype = len(hg.ntypes)
num_rels = len(hg.canonical_etypes)
node_feats = []
for ntype in hg.ntypes:
if len(hg.nodes[ntype].data) == 0 or 'feat' not in hg.nodes[ntype].data:
node_feats.append(None)
else:
feat = hg.nodes[ntype].data.pop('feat')
node_feats.append(feat.share_memory_())
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
g = dgl.to_homogeneous(hg)
u, v, eid = g.all_edges(form='all')
# global norm
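    # Each edge is weighted by 1 / in-degree of its destination node.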
_, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = th.ones(eid.shape[0]) / degrees
norm = norm.unsqueeze(1)
g.edata['norm'] = norm
g.edata['etype'] = g.edata[dgl.ETYPE]
g.ndata['type_id'] = g.ndata[dgl.NID]
g.ndata['ntype'] = g.ndata[dgl.NTYPE]
node_ids = th.arange(g.number_of_nodes())
# find out the target node ids
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
target_nids = node_ids[loc]
train_nids = target_nids[train_idx]
# Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
collator = dgl.dataloading.NodeCollator(g, train_nids, sampler, return_indices=True)
loader = dgl.dataloading.DataLoader(
collator.dataset, collate_fn=collator.collate,
batch_size=batch_size, shuffle=True, num_workers=4)
# test_sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
test_loader = DataLoader(dataset=test_idx.numpy(),
batch_size=batch_size,
collate_fn=collator.collate,
shuffle=False,
num_workers=4)
    # node features
    # None for a one-hot feature; if not None, it should be the feature tensor.
    #
embed_layer = RelGraphEmbedLayer(device,
g.number_of_nodes(),
node_tids,
num_of_ntype,
node_feats,
n_hidden,
sparse_emb=True)
# create model
# all model params are in device.
model = EntityClassify(device,
g.number_of_nodes(),
n_hidden,
num_classes,
num_rels,
num_bases=n_bases,
num_hidden_layers=n_layers - 2,
dropout=dropout,
use_self_loop=use_self_loop,
low_mem=low_mem,
layer_norm=False)
embed_layer = embed_layer.to(device)
model = model.to(device)
all_params = itertools.chain(model.parameters(), embed_layer.embeds.parameters())
optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
emb_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)
print("start training...")
t0 = time.time()
for epoch in range(n_epochs):
model.train()
embed_layer.train()
for i, sample_data in enumerate(loader):
input_nodes, output_nodes, seed_idx, blocks = sample_data
feats = embed_layer(input_nodes,
blocks[0].srcdata['ntype'],
blocks[0].srcdata['type_id'],
node_feats)
logits = model(blocks, feats)
loss = F.cross_entropy(logits, labels[train_idx][seed_idx])
optimizer.zero_grad()
emb_optimizer.zero_grad()
loss.backward()
optimizer.step()
emb_optimizer.step()
test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
test_loss = F.cross_entropy(test_logits, labels[test_seeds].cpu()).item()
test_acc = th.sum(test_logits.argmax(dim=1) == labels[test_seeds].cpu()).item() / len(test_seeds)
t1 = time.time()
return test_acc
import dgl
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.nn.pytorch as dglnn
import time
from .. import utils
class SAGE(nn.Module):
def __init__(self,
in_feats,
n_hidden,
n_classes,
n_layers,
activation,
dropout):
super().__init__()
self.n_layers = n_layers
self.n_hidden = n_hidden
self.n_classes = n_classes
self.layers = nn.ModuleList()
self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
for i in range(1, n_layers - 1):
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
self.dropout = nn.Dropout(dropout)
self.activation = activation
def forward(self, blocks, x):
h = x
for l, (layer, block) in enumerate(zip(self.layers, blocks)):
h = layer(block, h)
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
x : the input of entire node set.
        The inference code is written so that it can handle any number of nodes
        and layers.
"""
# During inference with sampling, multi-layer blocks are very inefficient because
# lots of computations in the first few layers are repeated.
        # Therefore, we compute the representation of all nodes layer by layer.
        # The nodes on each layer are split into batches.
# TODO: can we standardize this?
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l !=
len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
sampler,
batch_size=batch_size,
shuffle=True,
drop_last=False,
num_workers=4)
for input_nodes, output_nodes, blocks in dataloader:
block = blocks[0]
block = block.int().to(device)
h = x[input_nodes].to(device)
h = layer(block, h)
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
y[output_nodes] = h.cpu()
x = y
return y
def compute_acc(pred, labels):
"""
Compute the accuracy of prediction given the labels.
"""
labels = labels.long()
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
"""
Evaluate the model on the validation set specified by ``val_nid``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_nid : the node Ids for validation.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
pred = model.inference(g, inputs, batch_size, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid])
def load_subtensor(g, seeds, input_nodes, device):
"""
    Copies the features and labels of a set of nodes onto the GPU.
"""
batch_inputs = g.ndata['features'][input_nodes].to(device)
batch_labels = g.ndata['labels'][seeds].to(device)
return batch_inputs, batch_labels
@utils.benchmark('acc', 3600)
@utils.parametrize('data', ['ogbn-products', "reddit"])
def track_acc(data):
data = utils.process_data(data)
device = utils.get_bench_device()
g = data[0]
g.ndata['features'] = g.ndata['feat']
g.ndata['labels'] = g.ndata['label']
in_feats = g.ndata['features'].shape[1]
n_classes = data.num_labels
# Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()
num_epochs = 20
num_hidden = 16
num_layers = 2
fan_out = '5,10'
batch_size = 1024
lr = 0.003
dropout = 0.5
num_workers = 4
train_nid = th.nonzero(g.ndata['train_mask'], as_tuple=True)[0]
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler(
[int(fanout) for fanout in fan_out.split(',')])
dataloader = dgl.dataloading.NodeDataLoader(
g,
train_nid,
sampler,
batch_size=batch_size,
shuffle=True,
drop_last=False,
num_workers=num_workers)
# Define model and optimizer
model = SAGE(in_feats, num_hidden, n_classes, num_layers, F.relu, dropout)
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
# dry run one epoch
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Training loop
for epoch in range(num_epochs):
# Loop over the dataloader to sample the computation dependency graph as a list of
# blocks.
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
test_g = g
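    # Test nodes are those that belong to neither the training nor the validation set.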
test_nid = th.nonzero(
~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
test_acc = evaluate(
model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device)
return test_acc.item()
@@ -5,7 +5,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import IterableDataset, DataLoader
-import torchtext
 import dgl
 import dgl.function as fn
...
-import os, pickle
-import shutil, zipfile
+import json
+import os
+import pickle
+import shutil
+import zipfile
 import requests
 import inspect
 import numpy as np
 import pandas
 import dgl
 import torch
-import torchtext

 def _download(url, path, filename):
     fn = os.path.join(path, filename)
@@ -22,15 +25,30 @@ def _download(url, path, filename):
             writer.write(chunk)
     print('Download finished.')

 def get_livejournal():
-    _download('https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz',
-              '/tmp', 'soc-LiveJournal1.txt.gz')
-    df = pandas.read_csv('/tmp/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
-                         names=['src', 'dst'], compression='gzip')
-    src = np.array(df['src'])
-    dst = np.array(df['dst'])
-    print('construct the graph')
-    return dgl.DGLGraph((src, dst), readonly=True)
+    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
+    _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/livejournal/soc-LiveJournal1.txt.gz',
+              '/tmp/dataset', 'soc-LiveJournal1.txt.gz')
+    df = pandas.read_csv('/tmp/dataset/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
+                         names=['src', 'dst'], compression='gzip')
+    src = df['src'].values
+    dst = df['dst'].values
+    print('construct the graph')
+    return dgl.graph((src, dst))
+
+
+def get_filmbaster():
+    # Same as https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz
+    _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/friendster/com-friendster.ungraph.txt.gz',
+              '/tmp/dataset', 'com-friendster.ungraph.txt.gz')
+    df = pandas.read_csv('/tmp/dataset/com-friendster.ungraph.txt.gz', sep='\t', skiprows=4, header=None,
+                         names=['src', 'dst'], compression='gzip')
+    src = df['src'].values
+    dst = df['dst'].values
+    print('construct the graph')
+    return dgl.graph((src, dst))

 def get_graph(name):
     if name == 'livejournal':
@@ -39,6 +57,7 @@ def get_graph(name):
         print(name + " doesn't exist")
         return None

+
 class OGBDataset(object):
     def __init__(self, g, num_labels, predict_category=None):
         self._g = g
@@ -75,7 +94,8 @@ def load_ogb_product():
     graph.ndata['label'] = labels
     in_feats = graph.ndata['feat'].shape[1]
-    num_labels = len(torch.unique(labels[torch.logical_not(torch.isnan(labels))]))
+    num_labels = len(torch.unique(
+        labels[torch.logical_not(torch.isnan(labels))]))

     # Find the node IDs in the training, validation, and test set.
     train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
@@ -148,12 +168,15 @@ class PinsageDataset:
     def __getitem__(self, idx):
         return self._g

 def load_nowplaying_rs():
-    name = 'nowplaying_rs.pkl'  # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
+    import torchtext
+    # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
+    name = 'nowplaying_rs.pkl'
     dataset_dir = os.path.join(os.getcwd(), 'dataset')
     os.symlink('/tmp/dataset/', dataset_dir)
-    dataset_path = os.path.join(dataset_dir, name)
+    dataset_path = os.path.join(dataset_dir, "nowplaying_rs", name)
     # Load dataset
     with open(dataset_path, 'rb') as f:
         dataset = pickle.load(f)
@@ -169,14 +192,17 @@ def load_nowplaying_rs():
     # Assign user and movie IDs and use them as features (to learn an individual trainable
     # embedding for each entity)
-    g.nodes[user_ntype].data['id'] = torch.arange(g.number_of_nodes(user_ntype))
-    g.nodes[item_ntype].data['id'] = torch.arange(g.number_of_nodes(item_ntype))
+    g.nodes[user_ntype].data['id'] = torch.arange(
+        g.number_of_nodes(user_ntype))
+    g.nodes[item_ntype].data['id'] = torch.arange(
+        g.number_of_nodes(item_ntype))

     # Prepare torchtext dataset and vocabulary
     fields = {}
     examples = []
     for key, texts in item_texts.items():
-        fields[key] = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
+        fields[key] = torchtext.data.Field(
+            include_lengths=True, lower=True, batch_first=True)
     for i in range(g.number_of_nodes(item_ntype)):
         example = torchtext.data.Example.fromlist(
             [item_texts[key][i] for key in item_texts.keys()],
@@ -188,6 +214,7 @@ def load_nowplaying_rs():
     return PinsageDataset(g, user_ntype, item_ntype, textset)

+
 def process_data(name):
     if name == 'cora':
         return dgl.data.CoraGraphDataset()
@@ -212,29 +239,38 @@ def process_data(name):
     else:
         raise ValueError('Invalid dataset name:', name)

 def get_bench_device():
-    return os.environ.get('DGL_BENCH_DEVICE', 'cpu')
+    device = os.environ.get('DGL_BENCH_DEVICE', 'cpu')
+    if device.lower() == "gpu":
+        return "cuda:0"
+    else:
+        return device

 def setup_track_time(*args, **kwargs):
     # fix random seed
     np.random.seed(42)
     torch.random.manual_seed(42)

 def setup_track_acc(*args, **kwargs):
     # fix random seed
     np.random.seed(42)
     torch.random.manual_seed(42)

 TRACK_UNITS = {
-    'time' : 's',
-    'acc' : '%',
+    'time': 's',
+    'acc': '%',
 }

 TRACK_SETUP = {
-    'time' : setup_track_time,
-    'acc' : setup_track_acc,
+    'time': setup_track_time,
+    'acc': setup_track_acc,
 }

 def parametrize(param_name, params):
     """Decorator for benchmarking over a set of parameters.
@@ -297,6 +333,40 @@ def parametrize(param_name, params):
         return func
     return _wrapper

+
+class TestFilter:
+    def __init__(self):
+        self.conf = None
+        if "DGL_REG_CONF" in os.environ:
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            path = os.path.join(current_dir, "../../",
+                                os.environ["DGL_REG_CONF"])
+            with open(path, "r") as f:
+                self.conf = json.load(f)
+            if "INSTANCE_TYPE" in os.environ:
+                instance_type = os.environ["INSTANCE_TYPE"]
+            else:
+                raise Exception(
+                    "Must set both DGL_REG_CONF and INSTANCE_TYPE as env")
+            self.enabled_tests = self.conf[instance_type]["tests"]
+        else:
+            import logging
+            logging.warning("No regression test conf file specified")
+
+    def check(self, func):
+        funcfullname = inspect.getmodule(func).__name__ + "." + func.__name__
+        if self.conf is None:
+            return True
+        else:
+            for enabled_testname in self.enabled_tests:
+                if enabled_testname in funcfullname:
+                    return True
+        return False
+
+
+filter = TestFilter()
+
+
 def benchmark(track_type, timeout=60):
     """Decorator for indicating the benchmark type.
@@ -319,9 +389,13 @@ def benchmark(track_type, timeout=60):
             pass
     """
     assert track_type in ['time', 'acc']
+
     def _wrapper(func):
         func.unit = TRACK_UNITS[track_type]
         func.setup = TRACK_SETUP[track_type]
         func.timeout = timeout
+        if not filter.check(func):
+            # skip if not enabled
+            func.benchmark_name = "skip_" + func.__name__
         return func
     return _wrapper
@@ -13,6 +13,7 @@ pip install asv
 pip uninstall -y dgl
 export DGL_BENCH_DEVICE=$DEVICE
+echo "DGL_BENCH_DEVICE=$DGL_BENCH_DEVICE"

 pushd $ROOT/benchmarks
 cat asv.conf.json
 asv machine --yes
...
Regression Test Suite
========================
### Spec of task.json
```json
# Note: a test runs if any keyword specified below is a substring of its full test name.
# The full name of "benchmarks/model_acc/bench_sage_ns.track_acc" is "model_acc.bench_sage_ns.track_acc".
# For example, "model_acc" runs all tests under the "model_acc" folder,
# "bench_sage" runs both "bench_sage" and "bench_sage_ns",
# "bench_sage." runs only "bench_sage",
# "ns" runs any test whose name contains "ns",
# and "" runs all tests.
{
"c5.9xlarge": { # The instance type to run the test
"tests": [
"bench_sage" # The test to be run on this instance
],
"env": {
"DEVICE": "cpu" # The environment variable passed to publish.sh
}
},
"g4dn.2xlarge": {
...
}
}
```
### Environment variables
- `MOUNT_PATH`: host directory to be mounted into Docker. If set, `MOUNT_PATH` (on the host) is mapped to `/tmp/dataset` (inside the container).
- `INSTANCE_TYPE`: the current instance type.
- `DGL_REG_CONF`: path to `task.json`, relative to the repo root. If set, `INSTANCE_TYPE` must also be set. See the example invocation below.
\ No newline at end of file
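For example, a CPU regression run might be launched like this. This is only a sketch: the `task.json` path, machine name, and mount directory are placeholders, and `publish.sh` is assumed to be the Docker launch script that this suite's comments refer to.

```bash
# Hypothetical invocation; adjust the paths and instance type to your setup.
export DGL_REG_CONF=benchmarks/task.json   # path to task.json, relative to the repo root
export INSTANCE_TYPE=c5.9xlarge            # must match a key in task.json
export MOUNT_PATH=/data/dgl-regression     # optional: host dataset cache mapped to /tmp/dataset
bash publish.sh my-reg-machine cpu         # <machine name> <device>
```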
@@ -4,8 +4,15 @@ set -e
 . /opt/conda/etc/profile.d/conda.sh

+# Default building only with cpu
+DEVICE=${DGL_BENCH_DEVICE:-cpu}
+
 # build
-CMAKE_VARS="-DUSE_CUDA=ON"
+if [[ $DEVICE == "cpu" ]]; then
+    CMAKE_VARS=""
+else
+    CMAKE_VARS="-DUSE_CUDA=ON"
+fi
 mkdir -p build
 pushd build
 cmake $CMAKE_VARS ..
...
@@ -7,6 +7,7 @@ set -e
 pip install -r /asv/torch_gpu_pip.txt
 pip install pandas rdflib ogb

 # install
 pushd python
 rm -rf build *.egg-info dist
...
@@ -17,7 +17,6 @@
 # the host machine.
 #
-
 if [ $# -eq 2 ]; then
     MACHINE=$1
     DEVICE=$2
@@ -27,15 +26,51 @@ else
 fi

 WS_ROOT=/asv/dgl
-docker run --name dgl-reg \
-    --rm --runtime=nvidia \
-    --hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
+docker pull dgllib/dgl-ci-gpu:conda
+
+if [ -z "$DGL_REG_CONF"]; then
+    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
+else
+    DOCKER_ENV_OPT=" -e DGL_REG_CONF=$DGL_REG_CONF $DOCKER_ENV_OPT"
+fi
+
+if [ -z "$INSTANCE_TYPE"]; then
+    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
+else
+    DOCKER_ENV_OPT=" -e INSTANCE_TYPE=$INSTANCE_TYPE $DOCKER_ENV_OPT"
+fi
+
+if [ -z "$MOUNT_PATH"]; then
+    DOCKER_MOUNT_OPT=""
+else
+    DOCKER_MOUNT_OPT="-v ${MOUNT_PATH}:/tmp/dataset -v ${MOUNT_PATH}/dgl_home/:/root/.dgl/"
+fi
+
+echo $HOME
+echo "Mount Point: ${DOCKER_MOUNT_OPT}"
+echo "Env opt: ${DOCKER_ENV_OPT}"
+echo "DEVICE: ${DEVICE}"
+
+if [[ $DEVICE == "cpu" ]]; then
+    docker run --name dgl-reg \
+        --rm \
+        $DOCKER_MOUNT_OPT \
+        $DOCKER_ENV_OPT \
+        --shm-size="4g" \
+        --hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
+else
+    docker run --name dgl-reg \
+        --rm --runtime=nvidia \
+        $DOCKER_MOUNT_OPT \
+        $DOCKER_ENV_OPT \
+        --shm-size="4g" \
+        --hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
+fi
+
 docker exec dgl-reg mkdir -p $WS_ROOT
-docker cp ../.git dgl-reg:$WS_ROOT
-docker cp . dgl-reg:$WS_ROOT/benchmarks/
+docker cp ../../.git dgl-reg:$WS_ROOT
+docker cp ../ dgl-reg:$WS_ROOT/benchmarks/
 docker cp torch_gpu_pip.txt dgl-reg:/asv
-docker exec dgl-reg bash $WS_ROOT/benchmarks/run.sh $DEVICE
-docker cp dgl-reg:$WS_ROOT/benchmarks/results .
-docker cp dgl-reg:$WS_ROOT/benchmarks/html .
+docker exec $DOCKER_ENV_OPT dgl-reg bash $WS_ROOT/benchmarks/run.sh $DEVICE
+docker cp dgl-reg:$WS_ROOT/benchmarks/results ../
+docker cp dgl-reg:$WS_ROOT/benchmarks/html ../
 docker stop dgl-reg
@@ -10,4 +10,7 @@ networkx
 matplotlib
 nltk
 requests[security]
-tqdm
\ No newline at end of file
+tqdm
+awscli
+# 0.6.0 is for pytorch 1.5
+torchtext==0.6.0
\ No newline at end of file
{
"c5.9xlarge": {
"tests": [
""
],
"env": {
"DEVICE": "cpu"
}
},
"g4dn.2xlarge": {
"tests": [
""
],
"env": {
"DEVICE": "gpu"
}
}
}
\ No newline at end of file
-import argparse, time
+import argparse
+import time
 import numpy as np
 import networkx as nx
 import torch
@@ -12,6 +13,7 @@ from gcn import GCN
 #from gcn_mp import GCN
 #from gcn_spmv import GCN

+
 def evaluate(model, features, labels, mask):
     model.eval()
     with torch.no_grad():
@@ -22,6 +24,7 @@ def evaluate(model, features, labels, mask):
         correct = torch.sum(indices == labels)
         return correct.item() * 1.0 / len(labels)

+
 def main(args):
     # load and preprocess dataset
     if args.dataset == 'cora':
@@ -122,21 +125,21 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='GCN')
     register_data_args(parser)
     parser.add_argument("--dropout", type=float, default=0.5,
                         help="dropout probability")
     parser.add_argument("--gpu", type=int, default=-1,
                         help="gpu")
     parser.add_argument("--lr", type=float, default=1e-2,
                         help="learning rate")
     parser.add_argument("--n-epochs", type=int, default=200,
                         help="number of training epochs")
     parser.add_argument("--n-hidden", type=int, default=16,
                         help="number of hidden gcn units")
     parser.add_argument("--n-layers", type=int, default=1,
                         help="number of hidden gcn layers")
     parser.add_argument("--weight-decay", type=float, default=5e-4,
                         help="Weight for L2 loss")
     parser.add_argument("--self-loop", action='store_true',
                         help="graph self-loop (default=False)")
     parser.set_defaults(self_loop=False)
     args = parser.parse_args()
     print(args)
...