Unverified Commit 2c489fad authored by Minjie Wang, committed by GitHub

SPMV specialization (#32)

* fix edge list order problem in cached graph.

* minor fix

* fix bug in edge iter

* SPMV works

* gcn spmv on CPU

* change gcn style

* fix cached graph performance; fixed gcn dataset bug

* reorg dir

* non-batch spmv; partial update problem with shape change

* fix reorder problem; finish gcn-batch impl

* pop API

* GPU context
parent 11e42d10
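
The commit title refers to running the GCN neighbor aggregation as a sparse matrix-vector / matrix-matrix product (SPMV/SPMM) instead of iterating over edges. As a rough, illustrative sketch of that idea in plain SciPy + PyTorch (not the API added by this commit; spmv_aggregate is a hypothetical helper name):

    import numpy as np
    import scipy.sparse as sp
    import torch

    def spmv_aggregate(adj, feats):
        """Sum neighbor features with one sparse matmul: out[i] = sum_j adj[i, j] * feats[j]."""
        adj = sp.coo_matrix(adj)
        indices = torch.LongTensor(np.vstack((adj.row, adj.col)))
        values = torch.FloatTensor(adj.data)
        a = torch.sparse_coo_tensor(indices, values, adj.shape)
        # one SPMM call replaces the per-edge message passing loop
        return torch.sparse.mm(a, feats)
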
@@ -132,3 +132,6 @@ examples/pytorch/data/ind.citeseer.allx
examples/pytorch/.DS_Store
examples/.DS_Store
.DS_Store
# data directory
_download
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys

# (lingfan): the dataset loading and preprocessing code below follows tkipf/gcn
# https://github.com/tkipf/gcn/blob/master/gcn/utils.py
def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=bool)

def load_data(dataset_str):
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

def preprocess_features(features):
    """Row-normalize the feature matrix."""
    rowsum = np.array(features.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features
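
A quick usage sketch of the two helpers above; it assumes the pickled ind.<dataset>.* files from tkipf/gcn sit under ./data, and uses 'citeseer' only as an example dataset name:

    # example only; needs the data/ind.citeseer.* pickle files
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data('citeseer')
    features = preprocess_features(features)  # each non-empty row of `features` now sums to 1
    print(adj.shape, features.shape, y_train.shape, int(train_mask.sum()))
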
"""
Semi-Supervised Classification with Graph Convolutional Networks
Paper: https://arxiv.org/abs/1609.02907
Code: https://github.com/tkipf/gcn
"""
import networkx as nx
from dgl.graph import DGLGraph
import torch
import torch.nn as nn
import torch.nn.functional as F
import argparse
from dataset import load_data, preprocess_features
import numpy as np

class NodeUpdateModule(nn.Module):
    def __init__(self, input_dim, output_dim, act=None, p=None):
        super(NodeUpdateModule, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.act = act
        self.p = p

    def forward(self, node, msgs_repr):
        h = node['h']
        # aggregate messages
        h = h + msgs_repr
        h = self.linear(h)
        if self.act is not None:
            h = self.act(h)
        return {'h': h}

class GCN(nn.Module):
    def __init__(self, input_dim, num_hidden, num_classes, num_layers,
                 activation, dropout=None, output_projection=True):
        super(GCN, self).__init__()
        self.dropout = dropout
        self.layers = nn.ModuleList()
        # hidden layers
        last_dim = input_dim
        for _ in range(num_layers):
            self.layers.append(
                NodeUpdateModule(last_dim, num_hidden, act=activation, p=dropout))
            last_dim = num_hidden
        # output layer
        if output_projection:
            self.layers.append(NodeUpdateModule(num_hidden, num_classes, p=dropout))

    def forward(self, g):
        g.register_message_func(lambda src, dst, edge: src['h'])
        g.register_reduce_func('sum')
        for layer in self.layers:
            # apply dropout
            if self.dropout is not None:
                # TODO (lingfan): use batched dropout once we have better api
                # for global manipulation
                for n in g.nodes():
                    g.node[n]['h'] = F.dropout(g.node[n]['h'], p=self.dropout)
            g.register_update_func(layer)
            g.update_all()
        logits = [g.node[n]['h'] for n in g.nodes()]
        return torch.cat(logits, dim=0)

def main(args):
    # load and preprocess dataset
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(args.dataset)
    features = preprocess_features(features)

    # initialize graph
    g = DGLGraph(adj)

    # create GCN model
    model = GCN(features.shape[1],
                args.num_hidden,
                y_train.shape[1],
                args.num_layers,
                F.relu,
                args.dropout)

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # convert labels and masks to tensor
    labels = torch.FloatTensor(y_train)
    mask = torch.FloatTensor(train_mask.astype(np.float32))
    n_train = torch.sum(mask)

    for epoch in range(args.epochs):
        # reset grad
        optimizer.zero_grad()
        # reset graph states
        for n in g.nodes():
            g.node[n]['h'] = torch.FloatTensor(features[n].toarray())
        # forward
        logits = model.forward(g)
        # masked cross entropy loss
        # TODO: (lingfan) use gather to speed up
        logp = F.log_softmax(logits, 1)
        loss = -torch.sum(logp * labels * mask.view(-1, 1)) / n_train
        print("epoch {} loss: {}".format(epoch, loss.item()))
        loss.backward()
        optimizer.step()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='GCN')
    parser.add_argument("--dataset", type=str, required=True,
                        help="dataset name")
    parser.add_argument("--num-layers", type=int, default=1,
                        help="number of gcn layers")
    parser.add_argument("--num-hidden", type=int, default=64,
                        help="number of hidden units")
    parser.add_argument("--epochs", type=int, default=10,
                        help="number of training epochs")
    parser.add_argument("--dropout", type=float, default=None,
                        help="dropout probability")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="learning rate")
    args = parser.parse_args()
    print(args)

    main(args)
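
Assuming the script above is saved as gcn.py next to dataset.py and the data directory, a typical run (the flag values below are just the defaults made explicit) would look like:

    python gcn.py --dataset citeseer --num-layers 1 --num-hidden 64 --epochs 10 --lr 0.001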