"examples/vscode:/vscode.git/clone" did not exist on "d0313326c28710391a3566f4c7dc09f3ed3b2630"
Unverified commit 21255b65, authored by Minjie Wang, committed by GitHub

[Bugfix] tolist and dependencies in `dgl.data` (#239)

* Change `Index.tolist` -> `Index.tonumpy`; fix a bug in traversal; remove module-level dependencies in `dgl.data`

* Fix imports

* Fix `__all__` and some docstrings
parent eafcb7e7
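Note for readers of this diff: the old `Index.tolist` already returned a numpy array rather than a Python list, so the rename to `tonumpy` makes the contract honest; call sites that genuinely need a list now chain numpy's own `tolist()`, as the traversal hunks below show. A minimal sketch of the resulting idiom (plain numpy, not DGL source; the payload is made up):

```python
import numpy as np

# Hypothetical payload standing in for what an Index holds internally.
sections = np.array([3, 2, 4], dtype=np.int64)

as_array = sections          # what tonumpy() advertises: a numpy ndarray
as_list = sections.tolist()  # numpy's tolist() yields a real Python list

assert isinstance(as_array, np.ndarray)
assert as_list == [3, 2, 4]
```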
......@@ -8,7 +8,7 @@ import torch.optim as optim
 from torch.utils.data import DataLoader
 import dgl
-import dgl.data as data
+from dgl.data.tree import SST
 from tree_lstm import TreeLSTM
......@@ -25,22 +25,22 @@ def main(args):
     if cuda:
         th.cuda.set_device(args.gpu)
-    trainset = data.SST()
+    trainset = SST()
     train_loader = DataLoader(dataset=trainset,
                               batch_size=args.batch_size,
-                              collate_fn=data.SST.batcher(device),
+                              collate_fn=SST.batcher(device),
                               shuffle=True,
                               num_workers=0)
-    devset = data.SST(mode='dev')
+    devset = SST(mode='dev')
     dev_loader = DataLoader(dataset=devset,
                             batch_size=100,
-                            collate_fn=data.SST.batcher(device),
+                            collate_fn=SST.batcher(device),
                             shuffle=False,
                             num_workers=0)
-    testset = data.SST(mode='test')
+    testset = SST(mode='test')
     test_loader = DataLoader(dataset=testset,
-                             batch_size=100, collate_fn=data.SST.batcher(device), shuffle=False, num_workers=0)
+                             batch_size=100, collate_fn=SST.batcher(device), shuffle=False, num_workers=0)
     model = TreeLSTM(trainset.num_vocabs,
                      args.x_size,
......
"""Dataset for stochastic block model."""
import math
import os
import pickle
......
......@@ -6,8 +6,6 @@ Including:
 from __future__ import absolute_import

 from collections import namedtuple, OrderedDict

-from nltk.tree import Tree
-from nltk.corpus.reader import BracketParseCorpusReader
 import networkx as nx
 import numpy as np
......@@ -16,6 +14,8 @@ import dgl
 import dgl.backend as F
 from dgl.data.utils import download, extract_archive, get_download_dir, _get_dgl_url

+__all__ = ['SSTBatch', 'SST']
+
 _urls = {
     'sst' : 'dataset/sst.zip',
 }
......@@ -63,6 +63,7 @@ class SST(object):
         print('Dataset creation finished. #Trees:', len(self.trees))

     def _load(self):
+        from nltk.corpus.reader import BracketParseCorpusReader
         # load vocab file
         self.vocab = OrderedDict()
         with open(self.vocab_file, encoding='utf-8') as vf:
......
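The two module-level nltk imports removed above are what forced `import dgl.data` to require nltk; moving `BracketParseCorpusReader` inside `_load` defers that requirement to the first actual load. A sketch of the deferred-import pattern (illustrative names, not DGL source):

```python
class CorpusDataset(object):
    """Hypothetical stand-in for SST: nltk is only required on load."""

    def _load(self, root, fileids):
        # Deferred import: raises ImportError only if loading is attempted
        # without nltk installed, not at package-import time.
        from nltk.corpus.reader import BracketParseCorpusReader
        return BracketParseCorpusReader(root, fileids)
```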
......@@ -13,6 +13,8 @@ except ImportError:
         pass
     requests = requests_failed_to_import

+__all__ = ['download', 'check_sha1', 'extract_archive', 'get_download_dir']
+
 def _get_dgl_url(file_url):
     """Get DGL online url for download."""
     dgl_repo_url = 'https://s3.us-east-2.amazonaws.com/dgl.ai/'
......
......@@ -765,7 +765,7 @@ class FrameRef(MutableMapping):
         if isinstance(query, slice):
             query = range(query.start, query.stop)
         else:
-            query = query.tolist()
+            query = query.tonumpy()
         if isinstance(self._index_data, slice):
             self._index_data = range(self._index_data.start, self._index_data.stop)
......@@ -861,51 +861,3 @@ def frame_like(other, num_rows):
     # now supports non-exist columns.
     newf._initializers = other._initializers
     return newf
-
-def merge_frames(frames, indices, max_index, reduce_func):
-    """Merge a list of frames.
-
-    The result frame contains `max_index` number of rows. For each frame in
-    the given list, its row is merged as follows:
-
-        merged[indices[i][row]] += frames[i][row]
-
-    Parameters
-    ----------
-    frames : iterator of dgl.frame.FrameRef
-        A list of frames to be merged.
-    indices : iterator of dgl.utils.Index
-        The indices of the frame rows.
-    reduce_func : str
-        The reduce function (only 'sum' is supported currently)
-
-    Returns
-    -------
-    merged : FrameRef
-        The merged frame.
-    """
-    # TODO(minjie)
-    assert False, 'Buggy code, disabled for now.'
-    assert reduce_func == 'sum'
-    assert len(frames) > 0
-    schemes = frames[0].schemes
-    # create an adj to merge
-    # row index is equal to the concatenation of all the indices.
-    row = sum([idx.tolist() for idx in indices], [])
-    col = list(range(len(row)))
-    n = max_index
-    m = len(row)
-    row = F.unsqueeze(F.tensor(row, dtype=F.int64), 0)
-    col = F.unsqueeze(F.tensor(col, dtype=F.int64), 0)
-    idx = F.cat([row, col], dim=0)
-    dat = F.ones((m,))
-    adjmat = F.sparse_tensor(idx, dat, [n, m])
-    ctx_adjmat = utils.CtxCachedObject(lambda ctx: F.to_context(adjmat, ctx))
-    merged = {}
-    for key in schemes:
-        # the rhs of the spmv is the concatenation of all the frame columns
-        feats = F.pack([fr[key] for fr in frames])
-        merged_feats = F.spmm(ctx_adjmat.get(F.get_context(feats)), feats)
-        merged[key] = merged_feats
-    merged = FrameRef(Frame(merged))
-    return merged
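For the record, the deleted helper's documented contract was `merged[indices[i][row]] += frames[i][row]`, realized via a sparse adjacency matrix and SpMV. The same scatter-add semantics can be sketched with plain numpy (illustrative only; toy data, not DGL source):

```python
import numpy as np

# Two toy frames (rows of features) and the destination row of each.
frames = [np.array([[1., 1.], [2., 2.]]), np.array([[3., 3.]])]
indices = [np.array([0, 2]), np.array([0])]
max_index = 3

merged = np.zeros((max_index, 2))
for frame, index in zip(frames, indices):
    np.add.at(merged, index, frame)  # unbuffered scatter-add, row by row

# merged is now [[4., 4.], [0., 0.], [2., 2.]]
```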
......@@ -8,7 +8,7 @@ from collections import defaultdict
 import dgl
 from .base import ALL, is_all, DGLError, dgl_warning
 from . import backend as F
-from .frame import FrameRef, Frame, merge_frames
+from .frame import FrameRef, Frame
 from .graph_index import GraphIndex, create_graph_index
 from .runtime import ir, scheduler, Runtime
 from . import utils
......
......@@ -168,7 +168,7 @@ def _process_buckets(buckets):
     msg_ids = [utils.toindex(msg_id) for msg_id in msg_ids]

     # handle zero deg
-    degs = degs.tolist()
+    degs = degs.tonumpy()
     if degs[-1] == 0:
         degs = degs[:-1]
         zero_deg_nodes = dsts[-1]
......
......@@ -44,7 +44,7 @@ def bfs_nodes_generator(graph, source, reversed=False):
     ret = _CAPI_DGLBFSNodes(ghandle, source, reversed)
     all_nodes = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     node_frontiers = F.split(all_nodes, sections, dim=0)
     return node_frontiers
......@@ -84,7 +84,7 @@ def bfs_edges_generator(graph, source, reversed=False):
     ret = _CAPI_DGLBFSEdges(ghandle, source, reversed)
     all_edges = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     edge_frontiers = F.split(all_edges, sections, dim=0)
     return edge_frontiers
......@@ -120,7 +120,7 @@ def topological_nodes_generator(graph, reversed=False):
     ret = _CAPI_DGLTopologicalNodes(ghandle, reversed)
     all_nodes = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     return F.split(all_nodes, sections, dim=0)

 def dfs_edges_generator(graph, source, reversed=False):
......@@ -165,7 +165,7 @@ def dfs_edges_generator(graph, source, reversed=False):
     ret = _CAPI_DGLDFSEdges(ghandle, source, reversed)
     all_edges = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     return F.split(all_edges, sections, dim=0)

 def dfs_labeled_edges_generator(
......@@ -244,11 +244,11 @@ def dfs_labeled_edges_generator(
     # TODO(minjie): how to support directly creating python list
     if return_labels:
         all_labels = utils.toindex(ret(1)).tousertensor()
-        sections = utils.toindex(ret(2)).tousertensor().tolist()
+        sections = utils.toindex(ret(2)).tonumpy().tolist()
         return (F.split(all_edges, sections, dim=0),
                 F.split(all_labels, sections, dim=0))
     else:
-        sections = utils.toindex(ret(1)).tousertensor().tolist()
+        sections = utils.toindex(ret(1)).tonumpy().tolist()
         return F.split(all_edges, sections, dim=0)

 _init_api("dgl.traversal")
......@@ -15,24 +15,24 @@ class Index(object):
         self._initialize_data(data)

     def _initialize_data(self, data):
-        self._list_data = None  # a numpy type data or a slice
+        self._pydata = None  # a numpy type data or a slice
         self._user_tensor_data = dict()  # dictionary of user tensors
         self._dgl_tensor_data = None  # a dgl ndarray
         self._dispatch(data)

     def __iter__(self):
-        for i in self.tolist():
+        for i in self.tonumpy():
             yield int(i)

     def __len__(self):
-        if self._list_data is not None and isinstance(self._list_data, slice):
-            slc = self._list_data
+        if self._pydata is not None and isinstance(self._pydata, slice):
+            slc = self._pydata
             if slc.step is None:
                 return slc.stop - slc.start
             else:
                 return (slc.stop - slc.start) // slc.step
-        elif self._list_data is not None:
-            return len(self._list_data)
+        elif self._pydata is not None:
+            return len(self._pydata)
         elif len(self._user_tensor_data) > 0:
             data = next(iter(self._user_tensor_data.values()))
             return len(data)
......@@ -40,7 +40,7 @@ class Index(object):
             return len(self._dgl_tensor_data)

     def __getitem__(self, i):
-        return int(self.tolist()[i])
+        return int(self.tonumpy()[i])

     def _dispatch(self, data):
         """Store data based on its type."""
......@@ -59,35 +59,35 @@ class Index(object):
                 raise DGLError('Index data must be 1D int64 vector, but got: %s' % str(data))
             self._dgl_tensor_data = data
         elif isinstance(data, slice):
-            # save it in the _list_data temporarily; materialize it if `tolist` is called
-            self._list_data = data
+            # save it in the _pydata temporarily; materialize it if `tonumpy` is called
+            self._pydata = data
         else:
             try:
-                self._list_data = np.array([int(data)]).astype(np.int64)
+                self._pydata = np.array([int(data)]).astype(np.int64)
             except:
                 try:
                     data = np.array(data).astype(np.int64)
                     if data.ndim != 1:
                         raise DGLError('Index data must be 1D int64 vector,'
                                        ' but got: %s' % str(data))
-                    self._list_data = data
+                    self._pydata = data
                 except:
                     raise DGLError('Error index data: %s' % str(data))
-            self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self._list_data)
+            self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self._pydata)

-    def tolist(self):
-        """Convert to a python-list compatible object."""
-        if self._list_data is None:
+    def tonumpy(self):
+        """Convert to a numpy ndarray."""
+        if self._pydata is None:
             if self._dgl_tensor_data is not None:
-                self._list_data = self._dgl_tensor_data.asnumpy()
+                self._pydata = self._dgl_tensor_data.asnumpy()
             else:
                 data = self.tousertensor()
-                self._list_data = F.zerocopy_to_numpy(data)
-        elif isinstance(self._list_data, slice):
+                self._pydata = F.zerocopy_to_numpy(data)
+        elif isinstance(self._pydata, slice):
             # convert it to numpy array
-            slc = self._list_data
-            self._list_data = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
-        return self._list_data
+            slc = self._pydata
+            self._pydata = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
+        return self._pydata

     def tousertensor(self, ctx=None):
         """Convert to user tensor (defined in `backend`)."""
......@@ -100,7 +100,7 @@ class Index(object):
             self._user_tensor_data[F.cpu()] = F.zerocopy_from_dlpack(dl)
         else:
             # zero copy from numpy array
-            self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self.tolist())
+            self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self.tonumpy())
         if ctx not in self._user_tensor_data:
             # copy from cpu to another device
             data = next(iter(self._user_tensor_data.values()))
......@@ -117,8 +117,8 @@ class Index(object):
         return self._dgl_tensor_data

     def is_slice(self, start, stop, step=None):
-        return (isinstance(self._list_data, slice)
-                and self._list_data == slice(start, stop, step))
+        return (isinstance(self._pydata, slice)
+                and self._pydata == slice(start, stop, step))

     def __getstate__(self):
         return self.tousertensor()
......
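The renamed field also preserves the lazy-slice trick visible in the hunk above: a `slice` is stored as-is and only materialized into an int64 array on the first `tonumpy()` call, then cached. A minimal self-contained sketch of that behavior (hypothetical class, not the real `Index`):

```python
import numpy as np

class LazyIndex(object):
    """Hypothetical stand-in: materialize a slice lazily, cache the result."""

    def __init__(self, slc):
        self._pydata = slc  # a slice, or a numpy array once materialized

    def tonumpy(self):
        if isinstance(self._pydata, slice):
            slc = self._pydata
            self._pydata = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
        return self._pydata

idx = LazyIndex(slice(0, 5))
assert idx.tonumpy().tolist() == [0, 1, 2, 3, 4]  # converted once, then cached
```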
......@@ -11,14 +11,14 @@ def test_edge_id():
     gi.add_nodes(4)
     gi.add_edge(0, 1)
-    eid = gi.edge_id(0, 1).tolist()
+    eid = gi.edge_id(0, 1).tonumpy()
     assert len(eid) == 1
     assert eid[0] == 0
     assert gi.is_multigraph()

     # multiedges
     gi.add_edge(0, 1)
-    eid = gi.edge_id(0, 1).tolist()
+    eid = gi.edge_id(0, 1).tonumpy()
     assert len(eid) == 2
     assert eid[0] == 0
     assert eid[1] == 1
......@@ -60,7 +60,7 @@ def test_edge_id():
     gi.add_nodes(4)
     gi.add_edge(0, 1)
-    eid = gi.edge_id(0, 1).tolist()
+    eid = gi.edge_id(0, 1).tonumpy()
     assert len(eid) == 1
     assert eid[0] == 0
......
......@@ -62,13 +62,13 @@ def check_basics(g, ig):
     for u in randv.asnumpy():
         for v in randv.asnumpy():
-            if len(g.edge_id(u, v).tolist()) == 1:
-                assert g.edge_id(u, v).tolist() == ig.edge_id(u, v).tolist()
+            if len(g.edge_id(u, v)) == 1:
+                assert g.edge_id(u, v).tonumpy() == ig.edge_id(u, v).tonumpy()
             assert g.has_edge_between(u, v) == ig.has_edge_between(u, v)
     randv = utils.toindex(randv)
-    ids = g.edge_ids(randv, randv)[2].tolist()
-    assert sum(ig.edge_ids(randv, randv)[2].tolist() == ids) == len(ids)
-    assert sum(g.has_edges_between(randv, randv).tolist() == ig.has_edges_between(randv, randv).tolist()) == len(randv)
+    ids = g.edge_ids(randv, randv)[2].tonumpy()
+    assert sum(ig.edge_ids(randv, randv)[2].tonumpy() == ids) == len(ids)
+    assert sum(g.has_edges_between(randv, randv).tonumpy() == ig.has_edges_between(randv, randv).tonumpy()) == len(randv)

 def test_basics():
......
......@@ -208,7 +208,7 @@ def test_row3():
     assert f.is_contiguous()
     assert f.is_span_whole_column()
     assert f.num_rows == N
-    del f[th.tensor([2, 3])]
+    del f[toindex(th.tensor([2, 3]))]
     assert not f.is_contiguous()
     assert not f.is_span_whole_column()
     # delete is lazy: only reflect on the ref while the
......
......@@ -49,7 +49,7 @@ def test_index():
     # from np data
     data = np.ones((10,), dtype=np.int64) * 10
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
......@@ -59,7 +59,7 @@ def test_index():
     # from list
     data = [10] * 10
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
......@@ -69,7 +69,7 @@ def test_index():
     # from torch
     data = th.ones((10,), dtype=th.int64) * 10
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
......@@ -79,7 +79,7 @@ def test_index():
     # from dgl.NDArray
     data = dgl.ndarray.array(np.ones((10,), dtype=np.int64) * 10)
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
......
......@@ -46,13 +46,13 @@ Tree LSTM DGL Tutorial
 #
 import dgl
-import dgl.data as data
+from dgl.data.tree import SST

 # Each sample in the dataset is a constituency tree. The leaf nodes
 # represent words. The word is an int value stored in the "x" field.
 # The non-leaf nodes have a special word PAD_WORD. The sentiment
 # label is stored in the "y" feature field.
-trainset = data.SST(mode='tiny')  # the "tiny" set has only 5 trees
+trainset = SST(mode='tiny')  # the "tiny" set has only 5 trees
 tiny_sst = trainset.trees
 num_vocabs = trainset.num_vocabs
 num_classes = trainset.num_classes
......@@ -337,7 +337,7 @@ optimizer = th.optim.Adagrad(model.parameters(),
 train_loader = DataLoader(dataset=tiny_sst,
                           batch_size=5,
-                          collate_fn=data.SST.batcher(device),
+                          collate_fn=SST.batcher(device),
                           shuffle=False,
                           num_workers=0)
......