[Backend] TF backend (#978)

* tf * add builtin support * fiix * pytest * fix * fix * fix some bugs * fix selecting * fix todo * fix test * fix test fail in tf * fix * fix * fix gather row * fix gather row * log backend * fix gather row * fix gather row * fix for pytorch * fix * fix * fix * fix * fix * fix tests * fix * fix * fix * fix * fix * fix * fix convert * fix * fix * fix * fix inplace * add alignment setting * add debug option * Revert "add alignment setting" This reverts commit ec63fb3506ea84fff7d447a1fbdfd1d5d1fb6110. * tf ci * fix lint * fix lint * add tfdlpack * fix type * add env * fix backend * fix * fix tests * remove one_hot * remove comment * remove comment * fix * use pip to install all * fix test * fix base * fix * fix * add skip * upgrade cmake * change version * change ci * fix * fix * fix * fix * fix seg fault * fix * fix python version * fix * try fix * fix * fix * tf takes longer time in ci * change py version * fix * fix * fix oom * change kg env * change kg env * 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊 * 我再也不搞各种乱七八糟环境了…… * use pytest * Chang image

[Backend] TF backend (#978)
* tf * add builtin support * fiix * pytest * fix * fix * fix some bugs * fix selecting * fix todo * fix test * fix test fail in tf * fix * fix * fix gather row * fix gather row * log backend * fix gather row * fix gather row * fix for pytorch * fix * fix * fix * fix * fix * fix tests * fix * fix * fix * fix * fix * fix * fix convert * fix * fix * fix * fix inplace * add alignment setting * add debug option * Revert "add alignment setting" This reverts commit ec63fb3506ea84fff7d447a1fbdfd1d5d1fb6110. * tf ci * fix lint * fix lint * add tfdlpack * fix type * add env * fix backend * fix * fix tests * remove one_hot * remove comment * remove comment * fix * use pip to install all * fix test * fix base * fix * fix * add skip * upgrade cmake * change version * change ci * fix * fix * fix * fix * fix seg fault * fix * fix python version * fix * try fix * fix * fix * tf takes longer time in ci * change py version * fix * fix * fix oom * change kg env * change kg env * 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊 * 我再也不搞各种乱七八糟环境了…… * use pytest * Chang image
d30a69bf · VoVAllen · GitHub · cf9ba90f · d30a69bf · d30a69bf
Unverified Commit d30a69bf authored Dec 20, 2019 by VoVAllen Committed by GitHub Dec 20, 2019
20 changed files
--- a/python/dgl/backend/numpy/tensor.py
+++ b/python/dgl/backend/numpy/tensor.py
@@ -212,10 +212,3 @@ def zerocopy_to_numpy(input):
 def zerocopy_from_numpy(np_array):
    return np_array
-def one_hot(t, num_classes=-1):
-    if num_classes == -1:
-        num_classes = np.max(t) + 1
-    res = np.eye(num_classes)[np.array(t).reshape(-1)]
-    return res.reshape(list(t.shape)+[num_classes])
--- a/python/dgl/backend/pytorch/tensor.py
+++ b/python/dgl/backend/pytorch/tensor.py
@@ -281,8 +281,6 @@ def zerocopy_to_dgl_ndarray(input):
 def zerocopy_from_dgl_ndarray(input):
    return dlpack.from_dlpack(input.to_dlpack())
-def one_hot(t, num_classes=-1):
-    return th.nn.functional.one_hot(t, num_classes)
 class BinaryReduce(th.autograd.Function):

--- a/python/dgl/backend/tensorflow/__init__.py
+++ b/python/dgl/backend/tensorflow/__init__.py
+from .tensor import *
--- a/python/dgl/backend/tensorflow/tensor.py
+++ b/python/dgl/backend/tensorflow/tensor.py
+from __future__ import absolute_import
+from distutils.version import LooseVersion
+import tensorflow as tf
+from tensorflow.python.eager import context
+import builtins
+import tfdlpack
+import numpy as np
+from tfdlpack import to_dlpack, from_dlpack
+from ... import ndarray as nd
+from ... import kernel as K
+from ...function.base import TargetCode
+TF_VERSION = LooseVersion(tf.__version__)
+def data_type_dict():
+    return {'float16': tf.float16,
+            'float32': tf.float32,
+            'float64': tf.float64,
+            'uint8': tf.uint8,
+            'int8': tf.int8,
+            'int16': tf.int16,
+            'int32': tf.int32,
+            'int64': tf.int64}
+def cpu():
+    return "/cpu:0"
+def tensor(data, dtype=None):
+    return tf.convert_to_tensor(data, dtype=dtype)
+def as_scalar(data):
+    return data.numpy().asscalar()
+def get_preferred_sparse_format():
+    """Get the preferred sparse matrix format supported by the backend.
+    Different backends have their preferred backend. This info is useful when
+    constructing a sparse matrix.
+    """
+    return "coo"
+def sparse_matrix(data, index, shape, force_format=False):
+    fmt = index[0]
+    if fmt != 'coo':
+        raise TypeError(
+            'Tensorflow backend only supports COO format. But got %s.' % fmt)
+    spmat = tf.SparseTensor(indices=tf.transpose(
+        index[1], (1, 0)), values=data, dense_shape=shape)
+    return spmat, None
+def sparse_matrix_indices(spmat):
+    return ('coo', spmat.indices)
+def is_tensor(obj):
+    return isinstance(obj, tf.Tensor)
+def shape(input):
+    return input.shape
+def dtype(input):
+    return input.dtype
+def ndim(input):
+    return input.ndim
+def context(input):
+    return input.device
+def device_type(ctx):
+    return tf.DeviceSpec.from_string(ctx).device_type.lower()
+def device_id(ctx):
+    return tf.DeviceSpec.from_string(ctx).device_index
+def astype(input, ty):
+    return tf.cast(input, dtype=ty)
+def asnumpy(input):
+    if isinstance(input, tf.SparseTensor):
+        # tf.sparse.to_dense assume sorted indices, need to turn off validate_indices in our cases
+        return tf.sparse.to_dense(input, validate_indices=False).numpy()
+    else:
+        return input.numpy()
+def copy_to(input, ctx):
+    with tf.device(ctx):
+        new_tensor = tf.identity(input)
+    return new_tensor
+def sum(input, dim, keepdims=False):
+    return tf.reduce_sum(input, axis=dim, keepdims=keepdims)
+def reduce_sum(input):
+    return tf.reduce_sum(input)
+def mean(input, dim):
+    return tf.reduce_mean(input, axis=dim)
+def reduce_mean(input):
+    return th.reduce_mean(input)
+def max(input, dim):
+    return tf.reduce_max(input, axis=dim)
+def reduce_max(input):
+    return tf.reduce_max(input)
+def min(input, dim):
+    return tf.reduce_min(input, axis=dim)
+def reduce_min(input):
+    return tf.reduce_min(input)
+def argsort(input, dim, descending):
+    if descending:
+        return tf.cast(tf.argsort(input, axis=dim, direction="DESCENDING"), dtype=tf.int64)
+    else:
+        return tf.cast(tf.argsort(input, axis=dim, direction="ASCENDING"), dtype=tf.int64)
+def topk(input, k, dim, descending=True):
+    if not descending:
+        input = -input
+    shape = np.arange(input.ndim)
+    shape[dim], shape[-1] = shape[-1], shape[dim]
+    out1 = tf.transpose(input, perm=shape)
+    out2 = tf.math.top_k(out1, k=k, sorted=True)
+    out = tf.transpose(out2[0], shape)
+    if not descending:
+        out = -out
+    return out
+def argtopk(input, k, dim, descending=True):
+    if not descending:
+        input = -input
+    shape = np.arange(input.ndim)
+    shape[dim], shape[-1] = shape[-1], shape[dim]
+    out1 = tf.transpose(input, perm=shape)
+    out2 = tf.math.top_k(out1, k=k, sorted=True)
+    out = tf.transpose(out2[1], shape)
+    if not descending:
+        out = -out
+    return out
+def exp(input):
+    return tf.exp(input)
+def softmax(input, dim=-1):
+    return tf.math.softmax(input, axis=dim)
+def cat(seq, dim):
+    return tf.concat(seq, axis=dim)
+def stack(seq, dim):
+    return tf.stack(seq, axis=dim)
+def split(input, sizes_or_sections, dim):
+    return tf.split(input, sizes_or_sections, axis=dim)
+def repeat(input, repeats, dim):
+    return tf.keras.backend.repeat_elements(input, repeats, dim)
+def gather_row(data, row_index):
+    return tf.gather(data, row_index)
+def slice_axis(data, axis, begin, end):
+    # assert axis == 0
+    # tf doesn't behave well with negative
+    s = [slice(None) for i in range(data.ndim)]
+    if end == 0:
+        end = data.shape[axis]
+    s[axis] = slice(begin, end, None)
+    return data[tuple(s)]
+def take(data, indices, dim):
+    return tf.gather_nd(data, indices, dim)
+def narrow_row(x, start, stop):
+    return x[start:stop]
+def scatter_row(data, row_index, value):
+    row_index = tf.expand_dims(row_index, 1)
+    return tf.tensor_scatter_nd_update(data, row_index, value)
+def scatter_row_inplace(data, row_index, value):
+    raise NotImplementedError("Tensorflow doesn't support inplace update")
+def squeeze(input, dim):
+    return tf.squeeze(input, axis=dim)
+def unsqueeze(input, dim):
+    return tf.expand_dims(input, axis=dim)
+def reshape(input, shape):
+    return tf.reshape(input, shape)
+def swapaxes(input, axis1, axis2):
+    return tf.transpose(input, perm=[axis1, axis2])
+def zeros(shape, dtype, ctx):
+    with tf.device(ctx):
+        t = tf.zeros(shape, dtype=dtype)
+    return t
+def zeros_like(input):
+    return tf.zeros_like(input)
+def ones(shape, dtype, ctx):
+    with tf.device(ctx):
+        t = tf.ones(shape, dtype=dtype)
+    return t
+def uniform(shape, dtype, ctx, low, high):
+    with tf.device(ctx):
+        t = tf.random.uniform(shape, dtype=dtype, minval=low, maxval=high)
+    return t
+def pad_packed_tensor(input, lengths, value, l_min=None):
+    old_shape = input.shape
+    if isinstance(lengths, tf.Tensor):
+        max_len = as_scalar(lengths.max())
+    else:
+        max_len = builtins.max(lengths)
+    if l_min is not None:
+        max_len = builtins.max(max_len, l_min)
+    batch_size = len(lengths)
+    ndim = input.ndim
+    tensor_list = []
+    cum_row = 0
+    pad_nparray = np.zeros((ndim, 2), dtype=np.int32)
+    for l in lengths:
+        t = input[cum_row:cum_row+l]
+        pad_nparray[0, 1] = max_len - l
+        t = tf.pad(t, tf.constant(pad_nparray),
+                   mode='CONSTANT', constant_values=value)
+        tensor_list.append(t)
+        cum_row += l
+    return tf.stack(tensor_list, axis=0)
+def pack_padded_tensor(input, lengths):
+    out_list = []
+    for i, l in enumerate(lengths):
+        t = input[i]
+        out = t[:l]
+        out_list.append(out)
+    return tf.concat(out_list, axis=0)
+def unsorted_1d_segment_sum(input, seg_id, n_segs, dim):
+    assert dim == 0  # Why we need dim for 1d?
+    return tf.math.unsorted_segment_sum(input, seg_id, n_segs)
+def unsorted_1d_segment_mean(input, seg_id, n_segs, dim):
+    assert dim == 0  # Why we need dim for 1d?
+    return tf.math.unsorted_segment_mean(input, seg_id, n_segs)
+# TODO: TF has unsorted_segment_max, which can accelerate _max_on on batched graph
+def boolean_mask(input, mask):
+    return tf.boolean_mask(input, mask)
+def equal(x, y):
+    return x == y
+def logical_not(input):
+    return ~input
+def unique(input):
+    return tf.unique(input).y
+def full_1d(length, fill_value, dtype, ctx):
+    with tf.device(ctx):
+        t = tf.fill([length], value=fill_value)
+        t = tf.cast(t, dtype=dtype)
+    return t
+def nonzero_1d(input):
+    nonzero_bool = (input != False)
+    return tf.reshape(tf.where(nonzero_bool), (-1, ))
+def sort_1d(input):
+    return tf.sort(input), tf.cast(tf.argsort(input), dtype=tf.int64)
+def arange(start, stop):
+    with tf.device("/cpu:0"):
+        t = tf.range(start, stop, dtype=tf.int64)
+    return t
+def rand_shuffle(arr):
+    return tf.random.shuffle(arr)
+def zerocopy_to_dlpack(input):
+    return tfdlpack.to_dlpack(input)
+def zerocopy_from_dlpack(dlpack_tensor):
+    return tfdlpack.from_dlpack(dlpack_tensor)
+def zerocopy_to_numpy(input):
+    # NOTE: not zerocopy
+    return np.array(memoryview(input))
+def zerocopy_from_numpy(np_array):
+    # NOTE: not zerocopy
+    # This assumes tensor should be on cpu
+    with tf.device("/cpu:0"):
+        t = tf.convert_to_tensor(np_array)
+    return t
+def zerocopy_to_dgl_ndarray(input):
+    return nd.from_dlpack(zerocopy_to_dlpack(input))
+def zerocopy_from_dgl_ndarray(input):
+    return zerocopy_from_dlpack(input.to_dlpack())
+def binary_reduce(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
+                  out_size, lhs_map, rhs_map, out_map):
+    @tf.custom_gradient
+    def _lambda(lhs_data, rhs_data):
+        return binary_reduce_real(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
+                                  out_size, lhs_map, rhs_map, out_map)
+    return _lambda(lhs_data, rhs_data)
+def binary_reduce_real(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
+                       out_size, lhs_map, rhs_map, out_map):
+    lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data)
+    rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data)
+    feat_shape = K.infer_binary_feature_shape(
+        binary_op, lhs_data_nd, rhs_data_nd)
+    out_shape = feat_shape
+    if binary_op == 'dot':
+        out_shape = feat_shape[:-1]
+    # out_data = lhs_data.new_empty((out_size,) + out_shape)
+    out_data = tf.zeros((out_size,) + out_shape, dtype=lhs_data.dtype)
+    out_data_nd = zerocopy_to_dgl_ndarray(out_data)
+    K.binary_op_reduce(
+        reducer if reducer != 'mean' else 'sum',
+        binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd,
+        out_data_nd, lhs_map[0], rhs_map[0], out_map[0])
+    # normalize if mean reducer
+    # NOTE(zihao): this is a temporary hack and we should have better solution in the future.
+    if reducer == 'mean':
+        # degs = lhs_data.new_empty((out_data.shape[0],))
+        degs = tf.zeros((out_data.shape[0],), dtype=lhs_data.dtype)
+        degs_nd = zerocopy_to_dgl_ndarray(degs)
+        if lhs != TargetCode.DST:  # src or edge
+            target = lhs
+            n = lhs_data.shape[0]
+            in_map = lhs_map[0]
+        else:  # rhs != TargetCode.DST
+            target = rhs
+            n = rhs_data.shape[0]
+            in_map = rhs_map[0]
+        # in_ones = lhs_data.new_ones((n,))
+        in_ones = tf.ones((n,), dtype=lhs_data.dtype)
+        in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
+        K.copy_reduce(
+            'sum', graph, target, in_ones_nd, degs_nd, in_map, out_map[0])
+        # reshape
+        degs = tf.reshape(degs,
+                          (out_data.shape[0],) + (1,) * (out_data.ndim - 1))
+        degs = tf.clip_by_value(degs, clip_value_min=1,
+                                clip_value_max=np.inf)  # ???
+        out_data = out_data / degs
+    else:
+        degs = None
+    def grad(grad_out):
+        grad_lhs = None
+        grad_rhs = None
+        if reducer == 'mean':
+            grad_out = grad_out / degs
+        grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
+        if True:
+            # grad_lhs = grad_out.new_empty((lhs_data_nd.shape[0],) + feat_shape)
+            grad_lhs = tf.zeros((lhs_data_nd.shape[0],) + feat_shape)
+            K.backward_lhs_binary_op_reduce(
+                reducer if reducer != 'mean' else 'sum',
+                binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd,
+                out_data_nd, grad_out_nd, zerocopy_to_dgl_ndarray(grad_lhs),
+                lhs_map[1], rhs_map[1], out_map[1])
+            grad_lhs = _reduce_grad(grad_lhs, lhs_data_nd.shape)
+        if True:
+            # grad_rhs = grad_out.new_empty((rhs_data_nd.shape[0],) + feat_shape)
+            grad_rhs = tf.zeros((rhs_data_nd.shape[0],) + feat_shape)
+            K.backward_rhs_binary_op_reduce(
+                reducer if reducer != 'mean' else 'sum',
+                binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd,
+                out_data_nd, grad_out_nd, zerocopy_to_dgl_ndarray(grad_rhs),
+                lhs_map[1], rhs_map[1], out_map[1])
+            grad_rhs = _reduce_grad(grad_rhs, rhs_data_nd.shape)
+        return grad_lhs, grad_rhs
+    return out_data, grad
+def copy_reduce(reducer, graph, target, in_data, out_size, in_map,
+                out_map):
+    @tf.custom_gradient
+    def _labmda(in_data):
+        return copy_reduce_real(reducer, graph, target, in_data, out_size, in_map,
+                                out_map)
+    return _labmda(in_data)
+def copy_reduce_real(reducer, graph, target, in_data, out_size, in_map,
+                     out_map):
+    out_data = tf.zeros(
+        (out_size,) + tuple(in_data.shape[1:]), dtype=in_data.dtype)
+    in_data_nd = zerocopy_to_dgl_ndarray(in_data)
+    out_data_nd = zerocopy_to_dgl_ndarray(out_data)
+    K.copy_reduce(
+        reducer if reducer != 'mean' else 'sum',
+        graph, target, in_data_nd, out_data_nd, in_map[0], out_map[0])
+    # normalize if mean reducer
+    # NOTE(zihao): this is a temporary hack and we should have better solution in the future.
+    if reducer == 'mean':
+        # in_ones = in_data.new_ones((in_data.shape[0],))
+        in_ones = tf.ones(in_data.shape[0], dtype=in_data.dtype)
+        # degs = in_data.new_empty((out_data.shape[0],))
+        degs = tf.zeros(out_data.shape[0], dtype=in_data.dtype)
+        in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
+        degs_nd = zerocopy_to_dgl_ndarray(degs)
+        K.copy_reduce(
+            'sum', graph, target, in_ones_nd, degs_nd, in_map[0], out_map[0])
+        # reshape
+        degs = tf.reshape(degs,
+                          (out_data.shape[0],) + (1,) * (out_data.ndim - 1))
+        degs = tf.clip_by_value(degs, clip_value_min=1,
+                                clip_value_max=np.inf)  # TODO: ???
+        out_data = out_data / degs
+    else:
+        degs = None
+    # save_for_backward can only save variables
+    def grad(grad_out):
+        if reducer == 'mean':
+            grad_out = grad_out / degs
+        grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
+        # if ctx.needs_input_grad[3]:
+        if True:
+            # grad_in = grad_out.new_empty(in_data_nd.shape)
+            grad_in = tf.zeros(in_data_nd.shape)
+            K.backward_copy_reduce(
+                reducer if reducer != 'mean' else 'sum',
+                graph, target, in_data_nd, out_data_nd, grad_out_nd,
+                zerocopy_to_dgl_ndarray(grad_in), in_map[1], out_map[1])
+        return grad_in
+    return out_data, grad
+def _reduce_grad(grad, shape):
+    """Reduce gradient on the broadcast dimension
+    If there is broadcast in forward pass, gradients need to be reduced on
+    broadcast dimension. This function checks the input tensor shape and
+    gradient shape and perform the reduction.
+    Parameters
+    ----------
+    grad: Tensor
+        Gradient tensor
+    shape: tuple
+        Shape of input tensor
+    Returns
+    -------
+    Tensor
+    """
+    grad_shape = grad.shape[1:]
+    in_shape = shape[1:]
+    if in_shape == grad_shape:
+        # no need to reduce
+        return grad
+    num_to_squeeze = len(grad_shape) - len(in_shape)
+    # pad inshape
+    in_shape = (1,) * num_to_squeeze + in_shape
+    reduce_idx = np.array(np.nonzero(
+        np.array(grad_shape) - np.array(in_shape)))
+    reduce_idx += 1  # skip batch dim
+    reduce_idx_tensor = tf.constant(tuple(
+        reduce_idx.flatten().tolist()))
+    grad = tf.reduce_sum(grad, axis=reduce_idx_tensor, keepdims=True)
+    return tf.reshape(grad, shape)
+def sync():
+    context = context().context()
+    context.async_wait()
--- a/python/dgl/contrib/sampling/sampler.py
+++ b/python/dgl/contrib/sampling/sampler.py
@@ -623,7 +623,7 @@ class EdgeSampler(object):
            self._is_uniform = True
        else:
            self._is_uniform = False
-            self._edge_weight = F.zerocopy_to_dgl_ndarray(edge_weight[self._seed_edges])
+            self._edge_weight = F.zerocopy_to_dgl_ndarray(F.gather_row(edge_weight, self._seed_edges))
            if node_weight is None:
                self._node_weight = empty((0,), 'float32')
            else:

--- a/python/dgl/nodeflow.py
+++ b/python/dgl/nodeflow.py
@@ -339,8 +339,7 @@ class NodeFlow(DGLBaseGraph):
            The parent node id array.
        """
        nid = utils.toindex(nid)
-        # TODO(minjie): should not directly use []
+        return F.gather_row(self._node_mapping.tousertensor(), nid.tousertensor())
-        return self._node_mapping.tousertensor()[nid.tousertensor()]
    def map_to_parent_eid(self, eid):
        """This maps the child edge Ids to the parent Ids.
@@ -356,8 +355,7 @@ class NodeFlow(DGLBaseGraph):
            The parent edge id array.
        """
        eid = utils.toindex(eid)
-        # TODO(minjie): should not directly use []
+        return F.gather_row(self._edge_mapping.tousertensor(), eid.tousertensor())
-        return self._edge_mapping.tousertensor()[eid.tousertensor()]
    def map_from_parent_nid(self, layer_id, parent_nids, remap_local=False):
        """Map parent node Ids to NodeFlow node Ids in a certain layer.
@@ -509,7 +507,7 @@ class NodeFlow(DGLBaseGraph):
        ret = self._edge_mapping.tousertensor()[start:end]
        # If `add_self_loop` is enabled, the returned parent eid can be -1.
        # We have to make sure this case doesn't happen.
-        assert F.asnumpy(F.sum(ret == -1, 0)) == 0, "The eid in the parent graph is invalid."
+        assert F.asnumpy(ret == -1).sum(0) == 0, "The eid in the parent graph is invalid."
        return ret
    def block_edges(self, block_id, remap_local=False):
@@ -1025,7 +1023,7 @@ def _copy_to_like(arr1, arr2):
 def _get_frame(frame, names, ids, ctx):
    col_dict = {}
    for name in names:
-        col = frame[name][_copy_to_like(ids, frame[name])]
+        col = F.gather_row(frame[name], _copy_to_like(ids, frame[name]))
        if ctx:
            col = F.copy_to(col, ctx)
        col_dict[name] = col
@@ -1044,6 +1042,7 @@ def _copy_frame(frame, ctx):
 def _update_frame(frame, names, ids, new_frame):
    col_dict = {name: new_frame[name] for name in names}
    if len(col_dict) > 0:
+        # This will raise error for tensorflow, because inplace update is not supported
        frame.update_rows(ids, FrameRef(Frame(col_dict)), inplace=True)
 _init_api("dgl.nodeflow", __name__)
--- a/python/dgl/runtime/scheduler.py
+++ b/python/dgl/runtime/scheduler.py
@@ -1016,7 +1016,8 @@ def _build_idx_map(idx, nbits):
    x = idx.tousertensor()
    map_len = int(F.asnumpy(F.max(x, dim=0))) + 1
    old_to_new = F.full_1d(map_len, -1, dtype=F.int64, ctx=F.cpu())
-    F.scatter_row_inplace(old_to_new, x, F.arange(0, len(x)))
+    # Use out-place update due to tensorflow compatibility
+    old_to_new = F.scatter_row(old_to_new, x, F.arange(0, len(x)))
    old_to_new = utils.to_nbits_int(old_to_new, nbits)
    old_to_new = F.zerocopy_to_dgl_ndarray(old_to_new)
    return utils.CtxCachedObject(lambda ctx: nd.array(old_to_new, ctx=ctx))

--- a/python/dgl/runtime/spmv.py
+++ b/python/dgl/runtime/spmv.py
@@ -172,8 +172,8 @@ def build_gidx_and_mapping_uv(edge_tuples, num_src, num_dst):
    forward, backward = gidx.get_csr_shuffle_order(0)
    eid = eid.tousertensor()
    nbits = gidx.bits_needed(0)
-    forward_map = utils.to_nbits_int(eid[forward.tousertensor()], nbits)
+    forward_map = utils.to_nbits_int(F.gather_row(eid, forward.tousertensor()), nbits)
-    backward_map = utils.to_nbits_int(eid[backward.tousertensor()], nbits)
+    backward_map = utils.to_nbits_int(F.gather_row(eid, backward.tousertensor()), nbits)
    forward_map = F.zerocopy_to_dgl_ndarray(forward_map)
    backward_map = F.zerocopy_to_dgl_ndarray(backward_map)
    edge_map = utils.CtxCachedObject(

--- a/python/dgl/utils.py
+++ b/python/dgl/utils.py
@@ -384,7 +384,7 @@ def build_relabel_map(x, is_sorted=False):
        unique_x = x
    map_len = int(F.asnumpy(F.max(unique_x, dim=0))) + 1
    old_to_new = F.zeros((map_len,), dtype=F.int64, ctx=F.cpu())
-    F.scatter_row_inplace(old_to_new, unique_x, F.arange(0, len(unique_x)))
+    old_to_new = F.scatter_row(old_to_new, unique_x, F.arange(0, len(unique_x)))
    return unique_x, old_to_new
 def build_relabel_dict(x):

--- a/python/dgl/view.py
+++ b/python/dgl/view.py
@@ -41,7 +41,7 @@ class NodeView(object):
    def __call__(self):
        """Return the nodes."""
-        return F.arange(0, len(self))
+        return F.copy_to(F.arange(0, len(self)), F.cpu())
 class NodeDataView(MutableMapping):
    """The data view class when G.nodes[...].data is called.

--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -117,15 +117,15 @@ WorkspacePool::WorkspacePool(DLDeviceType device_type, std::shared_ptr<DeviceAPI
 }
 WorkspacePool::~WorkspacePool() {
-  for (size_t i = 0; i < array_.size(); ++i) {
+  // for (size_t i = 0; i < array_.size(); ++i) {
-    if (array_[i] != nullptr) {
+  //   if (array_[i] != nullptr) {
-      DGLContext ctx;
+  //     DGLContext ctx;
-      ctx.device_type = device_type_;
+  //     ctx.device_type = device_type_;
-      ctx.device_id = static_cast<int>(i);
+  //     ctx.device_id = static_cast<int>(i);
-      array_[i]->Release(ctx, device_.get());
+  //     array_[i]->Release(ctx, device_.get());
-      delete array_[i];
+  //     delete array_[i];
-    }
+  //   }
-  }
+  // }
 }
 void* WorkspacePool::AllocWorkspace(DGLContext ctx, size_t size) {

--- a/tests/backend/pytorch/__init__.py
+++ b/tests/backend/pytorch/__init__.py
@@ -26,6 +26,9 @@ def attach_grad(x):
        return x.requires_grad_()
 def backward(x, head_gradient=None):
+    if head_gradient is not None and head_gradient.shape[0] == 1 and len(head_gradient.shape) == 1:
+        # Fix for torch 1.3.1
+        head_gradient = th.tensor(head_gradient.item()).to(head_gradient.device)
    x.backward(head_gradient)
 def grad(x):

--- a/tests/backend/tensorflow/__init__.py
+++ b/tests/backend/tensorflow/__init__.py
+from __future__ import absolute_import
+import numpy as np
+import tensorflow as tf
+from scipy.sparse import coo_matrix
+def cuda():
+    return '/gpu:0'
+def is_cuda_available():
+    return tf.test.is_gpu_available(cuda_only=True)
+def array_equal(a, b):
+    return np.array_equal(a.numpy(), b.numpy())
+def allclose(a, b, rtol=1e-4, atol=1e-4):
+    return np.allclose(a.numpy(),
+                       b.numpy(), rtol=rtol, atol=atol)
+def randn(shape):
+    return tf.random.normal(shape)
+class GradContext:
+    def __init__(self):
+        self.tensor_for_grad = []
+        self.grad_list = []
+        self.tape = None
+    def set_tape(self, tape):
+        self.tape = tape
+    def add_tensor(self, x):
+        idx_pop = []
+        for idx, ele in enumerate(self.tensor_for_grad):
+            if ele._id == x._id:
+                idx_pop.append(idx)
+        if len(idx_pop) > 0:
+            self.tensor_for_grad.pop(idx_pop[0])
+        if self.tape is not None:
+            self.tape.watch(x)
+        self.tensor_for_grad.append(x)
+    def backward(self, x, head_gradient=None):
+        if head_gradient is not None:
+            x = x * head_gradient
+        self.grad_list = self.tape.gradient(x, self.tensor_for_grad)
+    def is_no_grad(self, x):
+        idx_pop = []
+        for idx, ele in enumerate(self.tensor_for_grad):
+            if ele._id == x._id:
+                idx_pop.append(idx)
+        if len(idx_pop) == 0:
+            return True
+        else:
+            return self.grad_list[idx_pop[0]] is None
+    def grad(self, x):
+        idx_pop = []
+        for idx, ele in enumerate(self.tensor_for_grad):
+            if ele._id == x._id:
+                idx_pop.append(idx)
+        assert len(idx_pop) == 1
+        t = self.grad_list[idx_pop[0]]
+        return tf.convert_to_tensor(t)
+cgrad = GradContext()
+def get_cgrad():
+    return cgrad
+class record_grad:
+    def __init__(self):
+        self.tape = tf.GradientTape()
+    def __enter__(self):
+        cgrad.set_tape(self.tape)
+        self.tape.__enter__()
+        for x in cgrad.tensor_for_grad:
+            self.tape.watch(x)
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        # pass
+        self.tape.__exit__(exc_type, exc_value, exc_traceback)
+        cgrad.tape = None
+def attach_grad(x):
+    cgrad.add_tensor(x)
+    return x
+def backward(x, head_gradient=None):
+    cgrad.backward(x, head_gradient)
+def grad(x):
+    return cgrad.grad(x)
+def is_no_grad(x):
+    return cgrad.is_no_grad(x)
+def full(shape, fill_value, dtype, ctx):
+    with tf.device(ctx):
+        t = tf.constant(fill_value, shape=shape, dtype=dtype)
+    return t
+def narrow_row_set(x, start, stop, new):
+    # x[start:stop] = new
+    raise NotImplementedError("TF doesn't support inplace update")
+def sparse_to_numpy(x):
+    # tf.sparse.to_dense assume sorted indices, need to turn off validate_indices in our cases
+    return tf.sparse.to_dense(x, validate_indices=False).numpy()
+def clone(x):
+    return tf.identity(x)
+def reduce_sum(x):
+    return tf.reduce_sum(x)
+def softmax(x, dim):
+    return tf.math.softmax(x, axis=dim)
+def spmm(x, y):
+    return tf.sparse.sparse_dense_matmul(x, y)
+def add(a, b):
+    return a + b
+def sub(a, b):
+    return a - b
+def mul(a, b):
+    return a * b
+def div(a, b):
+    return a / b
+def sum(x, dim):
+    return tf.reduce_sum(x, axis=dim)
+def max(x, dim):
+    return tf.reduce_max(x, axis=dim)
+def min(x, dim):
+    return tf.reduce_min(x, axis=dim)
+def prod(x, dim):
+    return tf.reduce_prod(x, axis=dim)
+def matmul(a, b):
+    return tf.linalg.matmul(a, b)
+def dot(a, b):
+    return sum(mul(a, b), dim=-1)
+no_grad = None
--- a/tests/compute/test_hetero_basics.py
+++ b/tests/compute/test_hetero_basics.py
@@ -600,7 +600,7 @@ def test_repr():
    repr_string = G.__repr__()
    print(repr_string)
-@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="will core dump")
 def test_group_apply_edges():
    def edge_udf(edges):
        h = F.sum(edges.data['feat'] * (edges.src['h'] + edges.dst['h']), dim=2)

--- a/tests/compute/test_heterograph.py
+++ b/tests/compute/test_heterograph.py
@@ -935,7 +935,6 @@ def test_level1():
    assert fail
-@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Core dump")
 def test_level2():
    #edges = {
    #    'follows': ([0, 1], [1, 2]),

--- a/tests/compute/test_nodeflow.py
+++ b/tests/compute/test_nodeflow.py
@@ -9,20 +9,24 @@ from dgl import utils
 import dgl.function as fn
 from functools import partial
 import itertools
+import unittest
 def generate_rand_graph(n, connect_more=False, complete=False, add_self_loop=False):
    if complete:
-        cord = [(i,j) for i, j in itertools.product(range(n), range(n)) if i != j]
+        cord = [(i, j)
+                for i, j in itertools.product(range(n), range(n)) if i != j]
        row = [t[0] for t in cord]
        col = [t[1] for t in cord]
        data = np.ones((len(row),))
        arr = sp.sparse.coo_matrix((data, (row, col)), shape=(n, n))
    else:
-        arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
+        arr = (sp.sparse.random(n, n, density=0.1,
+                                format='coo') != 0).astype(np.int64)
        # having one node to connect to all other nodes.
        if connect_more:
            arr[0] = 1
-            arr[:,0] = 1
+            arr[:, 0] = 1
    if add_self_loop:
        g = dgl.DGLGraph(arr, readonly=False)
        nodes = np.arange(g.number_of_nodes())
@@ -57,6 +61,7 @@ def test_self_loop():
        for eid in parent_loop_eid:
            assert eid in parent_eid
 def create_mini_batch(g, num_hops, add_self_loop=False):
    seed_ids = np.array([1, 2, 0, 3])
    sampler = NeighborSampler(g, batch_size=4, expand_factor=g.number_of_nodes(),
@@ -66,6 +71,7 @@ def create_mini_batch(g, num_hops, add_self_loop=False):
    assert_array_equal(F.asnumpy(nfs[0].layer_parent_nid(-1)), seed_ids)
    return nfs[0]
 def check_basic(g, nf):
    num_nodes = 0
    for i in range(nf.num_layers):
@@ -81,7 +87,8 @@ def check_basic(g, nf):
    assert np.all(F.asnumpy(nf.has_nodes(list(range(num_nodes)))))
    for i in range(num_nodes):
        assert nf.has_node(i)
-    assert np.all(F.asnumpy(nf.has_nodes(list(range(num_nodes, 2 * num_nodes)))) == 0)
+    assert np.all(F.asnumpy(nf.has_nodes(
+        list(range(num_nodes, 2 * num_nodes)))) == 0)
    for i in range(num_nodes, 2 * num_nodes):
        assert not nf.has_node(i)
@@ -114,7 +121,6 @@ def check_basic(g, nf):
        data1 = g.edges[nf.block_parent_eid(i)].data['h2']
        assert_array_equal(F.asnumpy(data), F.asnumpy(data1))
    # negative layer Ids.
    for i in range(-1, -nf.num_layers, -1):
        in_deg = nf.layer_in_degree(i)
@@ -140,7 +146,8 @@ def test_basic():
    # should also work for negative layer ids
    for l in range(-1, -num_layers, -1):
        nids1 = nf.map_from_parent_nid(l, parent_nids, remap_local=True)
-        nids2 = nf.map_from_parent_nid(l + num_layers, parent_nids, remap_local=True)
+        nids2 = nf.map_from_parent_nid(
+            l + num_layers, parent_nids, remap_local=True)
        assert_array_equal(F.asnumpy(nids1), F.asnumpy(nids2))
    g = generate_rand_graph(100)
@@ -162,16 +169,20 @@ def check_apply_nodes(create_node_flow, use_negative_block_id):
        nf = create_node_flow(g, num_layers)
        nf.copy_from_parent()
        new_feats = F.randn((nf.layer_size(l), 5))
        def update_func(nodes):
-            return {'h1' : new_feats}
+            return {'h1': new_feats}
        nf.apply_layer(l, update_func)
-        assert_array_equal(F.asnumpy(nf.layers[l].data['h1']), F.asnumpy(new_feats))
+        assert_array_equal(
+            F.asnumpy(nf.layers[l].data['h1']), F.asnumpy(new_feats))
        new_feats = F.randn((4, 5))
        def update_func1(nodes):
-            return {'h1' : new_feats}
+            return {'h1': new_feats}
        nf.apply_layer(l, update_func1, v=nf.layer_nid(l)[0:4])
-        assert_array_equal(F.asnumpy(nf.layers[l].data['h1'][0:4]), F.asnumpy(new_feats))
+        assert_array_equal(
+            F.asnumpy(nf.layers[l].data['h1'][0:4]), F.asnumpy(new_feats))
 def test_apply_nodes():
@@ -194,16 +205,19 @@ def check_apply_edges(create_node_flow):
            return {'h2': new_feats, "f2": edges.src["f"] + edges.dst["f"]}
        nf.apply_block(i, update_func)
-        assert_array_equal(F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
+        assert_array_equal(
+            F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
        # should also work for negative block ids
        nf.apply_block(-num_layers + i, update_func)
-        assert_array_equal(F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
+        assert_array_equal(
+            F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
        eids = nf.block_parent_eid(i)
        srcs, dsts = g.find_edges(eids)
        expected_f_sum = g.nodes[srcs].data["f"] + g.nodes[dsts].data["f"]
-        assert_array_equal(F.asnumpy(nf.blocks[i].data['f2']), F.asnumpy(expected_f_sum))
+        assert_array_equal(
+            F.asnumpy(nf.blocks[i].data['f2']), F.asnumpy(expected_f_sum))
 def check_apply_edges1(create_node_flow):
@@ -220,18 +234,21 @@ def check_apply_edges1(create_node_flow):
        nf.register_apply_edge_func(update_func, i)
        nf.apply_block(i)
-        assert_array_equal(F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
+        assert_array_equal(
+            F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
        # should also work for negative block ids
        nf.register_apply_edge_func(update_func, -num_layers + i)
        nf.apply_block(-num_layers + i)
-        assert_array_equal(F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
+        assert_array_equal(
+            F.asnumpy(nf.blocks[i].data['h2']), F.asnumpy(new_feats))
        eids = nf.block_parent_eid(i)
        srcs, dsts = g.find_edges(eids)
        expected_f_sum = g.nodes[srcs].data["f"] + g.nodes[dsts].data["f"]
        #expected_f_sum = g.ndata["f"][srcs] + g.ndata["f"][dsts]
-        assert_array_equal(F.asnumpy(nf.blocks[i].data['f2']), F.asnumpy(expected_f_sum))
+        assert_array_equal(
+            F.asnumpy(nf.blocks[i].data['f2']), F.asnumpy(expected_f_sum))
 def test_apply_edges():
@@ -251,11 +268,12 @@ def check_flow_compute(create_node_flow, use_negative_block_id=False):
    for i in range(num_layers):
        l = -num_layers + i if use_negative_block_id else i
        nf.block_compute(l, fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                         lambda nodes: {'h' : nodes.data['t'] + 1})
+                         lambda nodes: {'h': nodes.data['t'] + 1})
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                     lambda nodes: {'h' : nodes.data['t'] + 1})
+                     lambda nodes: {'h': nodes.data['t'] + 1})
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['h']),
-                        F.asnumpy(g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
+                        F.asnumpy(
+                            g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
                        rtol=1e-4, atol=1e-4)
    # Test the computation when only a few nodes are active in a layer.
@@ -264,12 +282,14 @@ def check_flow_compute(create_node_flow, use_negative_block_id=False):
        l = -num_layers + i if use_negative_block_id else i
        vs = nf.layer_nid(i+1)[0:4]
        nf.block_compute(l, fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                        lambda nodes: {'h' : nodes.data['t'] + 1}, v=vs)
+                         lambda nodes: {'h': nodes.data['t'] + 1}, v=vs)
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                     lambda nodes: {'h' : nodes.data['t'] + 1})
+                     lambda nodes: {'h': nodes.data['t'] + 1})
        data1 = nf.layers[i + 1].data['h'][0:4]
        data2 = g.nodes[nf.map_to_parent_nid(vs)].data['h']
-        assert_allclose(F.asnumpy(data1), F.asnumpy(data2), rtol=1e-4, atol=1e-4)
+        assert_allclose(F.asnumpy(data1), F.asnumpy(
+            data2), rtol=1e-4, atol=1e-4)
 def check_flow_compute1(create_node_flow, use_negative_block_id=False):
    num_layers = 2
@@ -284,12 +304,14 @@ def check_flow_compute1(create_node_flow, use_negative_block_id=False):
        l = -num_layers + i if use_negative_block_id else i
        nf.register_message_func(fn.copy_src(src='h', out='m'), l)
        nf.register_reduce_func(fn.sum(msg='m', out='t'), l)
-        nf.register_apply_node_func(lambda nodes: {'h' : nodes.data['t'] + 1}, l)
+        nf.register_apply_node_func(
+            lambda nodes: {'h': nodes.data['t'] + 1}, l)
        nf.block_compute(l)
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                     lambda nodes: {'h' : nodes.data['t'] + 1})
+                     lambda nodes: {'h': nodes.data['t'] + 1})
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['h']),
-                        F.asnumpy(g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
+                        F.asnumpy(
+                            g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
                        rtol=1e-4, atol=1e-4)
    # test the case that we register UDFs in all blocks.
@@ -299,16 +321,18 @@ def check_flow_compute1(create_node_flow, use_negative_block_id=False):
    nf.layers[0].data['h'] = nf.layers[0].data['h1']
    nf.register_message_func(fn.copy_src(src='h', out='m'))
    nf.register_reduce_func(fn.sum(msg='m', out='t'))
-    nf.register_apply_node_func(lambda nodes: {'h' : nodes.data['t'] + 1})
+    nf.register_apply_node_func(lambda nodes: {'h': nodes.data['t'] + 1})
    for i in range(num_layers):
        l = -num_layers + i if use_negative_block_id else i
        nf.block_compute(l)
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                     lambda nodes: {'h' : nodes.data['t'] + 1})
+                     lambda nodes: {'h': nodes.data['t'] + 1})
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['h']),
-                        F.asnumpy(g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
+                        F.asnumpy(
+                            g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
                        rtol=1e-4, atol=1e-4)
 class SrcMulEdgeMessageFunction(object):
    def __init__(self, src_field, edge_field, out_field):
        self.mul_op = operator.mul
@@ -327,7 +351,8 @@ class SrcMulEdgeMessageFunction(object):
        sdata = F.reshape(sdata, sshape + (1,) * (rank - F.ndim(sdata)))
        edata = F.reshape(edata, eshape + (1,) * (rank - F.ndim(edata)))
        ret = self.mul_op(sdata, edata)
-        return {self.out_field : ret}
+        return {self.out_field: ret}
 def check_flow_compute2(create_node_flow):
    num_layers = 2
@@ -339,14 +364,16 @@ def check_flow_compute2(create_node_flow):
    g.ndata['h'] = g.ndata['h1']
    nf.layers[0].data['h'] = nf.layers[0].data['h1']
    for i in range(num_layers):
-        nf.block_compute(i, SrcMulEdgeMessageFunction('h', 'h', 't'), fn.sum('t', 'h1'))
+        nf.block_compute(i, SrcMulEdgeMessageFunction(
+            'h', 'h', 't'), fn.sum('t', 'h1'))
        nf.block_compute(i, fn.src_mul_edge('h', 'h', 'h'), fn.sum('h', 'h'))
        g.update_all(fn.src_mul_edge('h', 'h', 'h'), fn.sum('h', 'h'))
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['h1']),
                        F.asnumpy(nf.layers[i + 1].data['h']),
                        rtol=1e-4, atol=1e-4)
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['h']),
-                        F.asnumpy(g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
+                        F.asnumpy(
+                            g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
                        rtol=1e-4, atol=1e-4)
    nf = create_node_flow(g, num_layers)
@@ -358,9 +385,11 @@ def check_flow_compute2(create_node_flow):
        nf.block_compute(i, fn.u_mul_v('h', 'h', 't'), fn.sum('t', 's'))
        g.update_all(fn.u_mul_v('h', 'h', 't'), fn.sum('t', 's'))
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['s']),
-                        F.asnumpy(g.nodes[nf.layer_parent_nid(i + 1)].data['s']),
+                        F.asnumpy(
+                            g.nodes[nf.layer_parent_nid(i + 1)].data['s']),
                        rtol=1e-4, atol=1e-4)
 def test_flow_compute():
    check_flow_compute(create_full_nodeflow)
    check_flow_compute(create_mini_batch)
@@ -380,11 +409,11 @@ def check_prop_flows(create_node_flow):
    # Test the computation on a layer at a time.
    for i in range(num_layers):
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                     lambda nodes: {'h' : nodes.data['t'] + 1})
+                     lambda nodes: {'h': nodes.data['t'] + 1})
    # Test the computation on all layers.
    nf2.prop_flow(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                  lambda nodes: {'h' : nodes.data['t'] + 1})
+                  lambda nodes: {'h': nodes.data['t'] + 1})
    assert_allclose(F.asnumpy(nf2.layers[-1].data['h']),
                    F.asnumpy(g.nodes[nf2.layer_parent_nid(-1)].data['h']),
                    rtol=1e-4, atol=1e-4)
@@ -395,6 +424,8 @@ def test_prop_flows():
    check_prop_flows(create_mini_batch)
+@unittest.skipIf(dgl.backend.backend_name == "tensorflow",
+                 reason="TF doesn't support inplace update, nf.copy_to_parent will trigger this")
 def test_copy():
    num_layers = 2
    g = generate_rand_graph(100)
@@ -417,7 +448,8 @@ def test_copy():
    nf = create_mini_batch(g, num_layers)
    node_embed_names = [['h'], ['h1'], ['h']]
    edge_embed_names = [['h2'], ['h2']]
-    nf.copy_from_parent(node_embed_names=node_embed_names, edge_embed_names=edge_embed_names)
+    nf.copy_from_parent(node_embed_names=node_embed_names,
+                        edge_embed_names=edge_embed_names)
    for i in range(nf.num_layers):
        assert len(node_embed_names[i]) == len(nf.layers[i].data.keys())
        for key in node_embed_names[i]:
@@ -434,14 +466,16 @@ def test_copy():
    nf = create_mini_batch(g, num_layers)
    g.ndata['h0'] = F.clone(g.ndata['h'])
    node_embed_names = [['h0'], [], []]
-    nf.copy_from_parent(node_embed_names=node_embed_names, edge_embed_names=None)
+    nf.copy_from_parent(node_embed_names=node_embed_names,
+                        edge_embed_names=None)
    for i in range(num_layers):
        nf.block_compute(i, fn.copy_src(src='h%d' % i, out='m'), fn.sum(msg='m', out='t'),
-                         lambda nodes: {'h%d' % (i+1) : nodes.data['t'] + 1})
+                         lambda nodes: {'h%d' % (i+1): nodes.data['t'] + 1})
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'),
-                     lambda nodes: {'h' : nodes.data['t'] + 1})
+                     lambda nodes: {'h': nodes.data['t'] + 1})
        assert_allclose(F.asnumpy(nf.layers[i + 1].data['h%d' % (i+1)]),
-                        F.asnumpy(g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
+                        F.asnumpy(
+                            g.nodes[nf.layer_parent_nid(i + 1)].data['h']),
                        rtol=1e-4, atol=1e-4)
    nf.copy_to_parent(node_embed_names=[['h0'], ['h1'], ['h2']])
    for i in range(num_layers + 1):
@@ -453,17 +487,20 @@ def test_copy():
    g.ndata['h1'] = F.clone(g.ndata['h'])
    g.ndata['h2'] = F.clone(g.ndata['h'])
    node_embed_names = [['h0'], ['h1'], ['h2']]
-    nf.copy_from_parent(node_embed_names=node_embed_names, edge_embed_names=None)
+    nf.copy_from_parent(node_embed_names=node_embed_names,
+                        edge_embed_names=None)
    def msg_func(edge, ind):
        assert 'h%d' % ind in edge.src.keys()
-        return {'m' : edge.src['h%d' % ind]}
+        return {'m': edge.src['h%d' % ind]}
    def reduce_func(node, ind):
        assert 'h%d' % (ind + 1) in node.data.keys()
-        return {'h' : F.sum(node.mailbox['m'], 1) + node.data['h%d' % (ind + 1)]}
+        return {'h': F.sum(node.mailbox['m'], 1) + node.data['h%d' % (ind + 1)]}
    for i in range(num_layers):
-        nf.block_compute(i, partial(msg_func, ind=i), partial(reduce_func, ind=i))
+        nf.block_compute(i, partial(msg_func, ind=i),
+                         partial(reduce_func, ind=i))
 def test_block_edges():
@@ -533,7 +570,8 @@ def test_block_incidence_matrix():
        # should work for negative block ids
        adjs_by_neg = []
        for typestr in typestrs:
-            adj_by_neg, _ = nf.block_incidence_matrix(-nf.num_blocks + i, typestr, F.cpu())
+            adj_by_neg, _ = nf.block_incidence_matrix(
+                -nf.num_blocks + i, typestr, F.cpu())
            adj_by_neg = F.sparse_to_numpy(adj_by_neg)
            adjs_by_neg.append(adj_by_neg)

--- a/tests/compute/test_readout.py
+++ b/tests/compute/test_readout.py
@@ -63,7 +63,6 @@ def test_simple_readout():
    # assert F.allclose(max_bg_e, F.stack([maxe1, F.zeros(5)], 0)) 
-@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Don't know why fail")
 def test_topk_nodes():
    # test#1: basic
    g0 = dgl.DGLGraph(nx.path_graph(14))
@@ -101,7 +100,6 @@ def test_topk_nodes():
    assert F.allclose(val, F.stack([F.topk(feat0, 6, 0), F.topk(feat1, 6, 0)], 0))
-@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Don't know why fail")
 def test_topk_edges():
    # test#1: basic
    g0 = dgl.DGLGraph(nx.path_graph(14))

--- a/tests/compute/test_sampler.py
+++ b/tests/compute/test_sampler.py
@@ -158,6 +158,7 @@ def test_layer_sampler():
    _test_layer_sampler()
    _test_layer_sampler(prefetch=True)
+@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Error occured when multiprocessing")
 def test_nonuniform_neighbor_sampler():
    # Construct a graph with
    # (1) A path (0, 1, ..., 99) with weight 1
@@ -235,6 +236,7 @@ def check_head_tail(g):
    assert len(tail_nid) == len(g.tail_nid)
    np.testing.assert_equal(tail_nid, ldst)
 def check_negative_sampler(mode, exclude_positive, neg_size):
    g = generate_rand_graph(100)
    num_edges = g.number_of_edges()
@@ -305,7 +307,7 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
        neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
        neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid)
        exists = neg_edges.edata['false_neg']
-        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
+        neg_edges.edata['etype'] = F.gather_row(g.edata['etype'], neg_eid)
        for i in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i])
            if g.has_edge_between(u, v):
@@ -346,14 +348,14 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
                                            exclude_positive=exclude_positive,
                                            return_false_neg=True):
        pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid')
-        assert_array_equal(F.asnumpy(pos_edges.parent_eid[pos_leid]),
+        assert_array_equal(F.asnumpy(F.gather_row(pos_edges.parent_eid, pos_leid)),
-                           F.asnumpy(g.edge_ids(pos_edges.parent_nid[pos_lsrc],
+                           F.asnumpy(g.edge_ids(F.gather_row(pos_edges.parent_nid, pos_lsrc),
-                                                pos_edges.parent_nid[pos_ldst])))
+                                                F.gather_row(pos_edges.parent_nid, pos_ldst))))
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
-        neg_src = neg_edges.parent_nid[neg_lsrc]
+        neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc)
-        neg_dst = neg_edges.parent_nid[neg_ldst]
+        neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
-        neg_eid = neg_edges.parent_eid[neg_leid]
+        neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid)
        for i in range(len(neg_eid)):
            neg_d = int(F.asnumpy(neg_dst[i]))
            neg_e = int(F.asnumpy(neg_eid[i]))
@@ -362,8 +364,8 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
                assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)]
        check_head_tail(neg_edges)
-        pos_tails = pos_edges.parent_nid[pos_edges.tail_nid]
+        pos_tails = F.gather_row(pos_edges.parent_nid, pos_edges.tail_nid)
-        neg_tails = neg_edges.parent_nid[neg_edges.tail_nid]
+        neg_tails = F.gather_row(neg_edges.parent_nid, neg_edges.tail_nid)
        pos_tails = np.sort(F.asnumpy(pos_tails))
        neg_tails = np.sort(F.asnumpy(neg_tails))
        np.testing.assert_equal(pos_tails, neg_tails)
@@ -387,11 +389,11 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
-        neg_src = neg_edges.parent_nid[neg_lsrc]
+        neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc)
-        neg_dst = neg_edges.parent_nid[neg_ldst]
+        neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
-        neg_eid = neg_edges.parent_eid[neg_leid]
+        neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid)
        exists = neg_edges.edata['false_neg']
-        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
+        neg_edges.edata['etype'] = F.gather_row(g.edata['etype'], neg_eid)
        for i in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i])
            if g.has_edge_between(u, v):
@@ -414,11 +416,11 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
-        neg_src = neg_edges.parent_nid[neg_lsrc]
+        neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc)
-        neg_dst = neg_edges.parent_nid[neg_ldst]
+        neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
-        neg_eid = neg_edges.parent_eid[neg_leid]
+        neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid)
        exists = neg_edges.edata['false_neg']
-        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
+        neg_edges.edata['etype'] = F.gather_row(g.edata['etype'], neg_eid)
        for i in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i])
            if g.has_edge_between(u, v):
@@ -463,7 +465,7 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
        else:
            neg_dst = neg_edges.parent_nid[neg_ldst]
            np.add.at(node_sampled, F.asnumpy(neg_dst), 1)
-        np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
+        np.add.at(edge_sampled, F.asnumpy(F.gather_row(pos_edges.parent_eid, pos_leid)), 1)
        total_samples += batch_size
        if (total_samples >= max_samples):
@@ -496,12 +498,12 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
        _, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
        neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
        if 'head' in mode:
-            neg_src = neg_edges.parent_nid[neg_lsrc]
+            neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc)
            np.add.at(node_sampled, F.asnumpy(neg_src), 1)
        else:
-            neg_dst = neg_edges.parent_nid[neg_ldst]
+            neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
            np.add.at(node_sampled, F.asnumpy(neg_dst), 1)
-        np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
+        np.add.at(edge_sampled, F.asnumpy(F.gather_row(pos_edges.parent_eid, pos_leid)), 1)
        total_samples += batch_size
        if (total_samples >= max_samples):
@@ -522,7 +524,8 @@ def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
    assert np.allclose(node_rate, node_rate_a * 5, atol=0.002)
    assert np.allclose(node_rate_a, node_rate_b, atol=0.0002)
-@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Core dump")
+@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support item assignment")
 def test_negative_sampler():
    check_negative_sampler('PBG-head', False, 10)
    check_negative_sampler('head', True, 10)

--- a/tests/scripts/build_dgl.sh
+++ b/tests/scripts/build_dgl.sh
 #!/bin/bash
 set -e
+. /opt/conda/etc/profile.d/conda.sh
 if [ $# -ne 1 ]; then
    echo "Device argument required, can be cpu or gpu"
@@ -25,10 +26,14 @@ make -j4
 popd
 pushd python
+for backend in pytorch mxnet tensorflow
+do 
+conda activate "${backend}-ci"
 rm -rf build *.egg-info dist
-pip3 uninstall -y dgl
+pip uninstall -y dgl
 # test install
 python3 setup.py install
 # test inplace build (for cython)
 python3 setup.py build_ext --inplace
+done
 popd
\ No newline at end of file
--- a/tests/scripts/task_example_test.sh
+++ b/tests/scripts/task_example_test.sh
 #!/bin/bash
+. /opt/conda/etc/profile.d/conda.sh
+conda activate pytorch-ci
 GCN_EXAMPLE_DIR="./examples/pytorch/"
 function fail {