sync BN

25985c31 · Hang Zhang · d40adbc4 · 25985c31 · 25985c31 · 25985c31
Commit 25985c31 authored Apr 12, 2018 by Hang Zhang
15 changed files
--- a/encoding/functions/customize.py
+++ b/encoding/functions/customize.py
@@ -5,16 +5,14 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

+"""Encoding Customized Functions"""
 import math
-import threading
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torch.autograd import Function, Variable
-from torch.nn.modules.utils import _single, _pair, _triple
+from torch.nn.modules.utils import _pair

 from .._ext import encoding_lib

@@ -23,32 +21,31 @@ __all__ = ['dilatedavgpool2d']
 class _dilatedavgpool2d(Function):
    @staticmethod
    def forward(ctx, input, kernel_size, stride, padding,
-            dilation=1):
+                dilation=1):
        ctx.kH, ctx.kW = _pair(kernel_size)
-        ctx.dH, ctx.dW = _pair(stride if stride is not None else 
-            kernel_size)
+        ctx.dH, ctx.dW = _pair(stride if stride is not None else kernel_size)
        ctx.padH, ctx.padW = _pair(padding)
        ctx.dilationH, ctx.dilationW = _pair(dilation)
-        b,c,h,w = input.size()
-        if ctx.dH==1 and ctx.dW==1:
+        b, c, h, w = input.size()
+        if ctx.dH == 1 and ctx.dW == 1:
            # keep the size for dilated avgpool
            ow, oh = w, h
        else:
            ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
            oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
        with torch.cuda.device_of(input):
-            output = input.new(b,c,oh,ow)
+            output = input.new(b, c, oh, ow)
        ctx.save_for_backward(input)
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(input, output,
-                    ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
-                    ctx.dilationH, ctx.dilationW)
+                encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(
+                    input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
+                    ctx.padW, ctx.dilationH, ctx.dilationW)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(input, output,
-                    ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
-                    ctx.dilationH, ctx.dilationW)
+                encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(
+                    input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
+                    ctx.padW, ctx.dilationH, ctx.dilationW)
        else:
            raise RuntimeError('Unimplemented data type!')
        return output
@@ -75,13 +72,14 @@ class _dilatedavgpool2d(Function):
        return gradInput, None, None, None, None


-def dilatedavgpool2d(input, kernel_size, stride=None, padding=0, 
-        dilation=1):
-    """Dilated Average Pool 2d, for dilation of DenseNet. 
- 
+def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
+                     dilation=1):
+    """Dilated Average Pool 2d, for dilation of DenseNet.
+
    Reference:

-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang,
+        Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation.” CVPR 2018

    Applies 2D average-pooling operation in kh x kw regions by step size
    dh x dw steps. The number of output features is equal to the number of
@@ -99,5 +97,4 @@ def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
          a tuple (padh x padw), Default: 0
        dilation: the dilation parameter similar to Conv2d
    """
-    return _dilatedavgpool2d.apply(input, kernel_size, stride, padding,
-            dilation)
+    return _dilatedavgpool2d.apply(input, kernel_size, stride, padding, dilation)
--- a/encoding/functions/encoding.py
+++ b/encoding/functions/encoding.py
@@ -5,13 +5,11 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-import threading
+"""Functions for Encoding Layer"""
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torch.autograd import Function, Variable
 from .._ext import encoding_lib

@@ -19,13 +17,13 @@ __all__ = ['aggregate', 'scaledL2']

 class _aggregate(Function):
    @staticmethod
-    def forward(self, A, X, C):
+    def forward(ctx, A, X, C):
        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
-        self.save_for_backward(A, X, C)
-        B, N, K = A.size()
+        ctx.save_for_backward(A, X, C)
+        B, _, K = A.size()
        D = X.size(2)
        with torch.cuda.device_of(A):
-            E = A.new(B,K,D)
+            E = A.new(B, K, D)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
@@ -37,19 +35,19 @@ class _aggregate(Function):
        return E

    @staticmethod
-    def backward(self, gradE):
-        A, X, C = self.saved_variables
+    def backward(ctx, gradE):
+        A, X, C = ctx.saved_variables
        with torch.cuda.device_of(A):
            gradA = Variable(A.data.new().resize_as_(A.data))
            gradX = Variable(A.data.new().resize_as_(X.data))
            gradC = Variable(A.data.new().resize_as_(C.data))
        if isinstance(A.data, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, 
+                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, \
                    gradE.data, A.data, X.data, C.data)
        elif isinstance(A.data, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, 
+                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, \
                    gradE.data, A.data, X.data, C.data)
        else:
            raise RuntimeError('Unimplemented data type!')
@@ -59,14 +57,17 @@ class _aggregate(Function):

 def aggregate(A, X, C):
    r"""
-    Aggregate operation, aggregate the residuals of inputs (:math:`X`) with repect to the codewords (:math:`C`) with assignment weights (:math:`A`).
-    
+    Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
+    to the codewords (:math:`C`) with assignment weights (:math:`A`).

    .. math::
        e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)

    Shape:
-        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}` :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}`  (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
+        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}`
+          :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}`
+          (where :math:`B` is batch, :math:`N` is total number of features,
+          :math:`K` is number of codewords, :math:`D` is feature dimensions.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`

    Examples:
@@ -82,11 +83,11 @@ def aggregate(A, X, C):

 class _scaledL2(Function):
    @staticmethod
-    def forward(self, X, C, S):
-        B,N,D = X.size()
+    def forward(ctx, X, C, S):
+        B, N, _ = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
-            SL = X.new(B,N,K)
+            SL = X.new(B, N, K)
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
@@ -95,12 +96,12 @@ class _scaledL2(Function):
                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
-        self.save_for_backward(X, C, S, SL)
+        ctx.save_for_backward(X, C, S, SL)
        return SL

    @staticmethod
-    def backward(self, gradSL):
-        X, C, S, SL = self.saved_variables
+    def backward(ctx, gradSL):
+        X, C, S, SL = ctx.saved_variables
        K = C.size(0)
        with torch.cuda.device_of(X.data):
            gradX = Variable(X.data.new().resize_as_(X.data))
@@ -108,15 +109,15 @@ class _scaledL2(Function):
            gradS = Variable(X.data.new().resize_as_(S.data))
        if isinstance(X.data, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, 
+                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, \
                    gradX.data, gradC.data, X.data, C.data, S.data)
        elif isinstance(X.data, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, 
+                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, \
                    gradX.data, gradC.data, X.data, C.data, S.data)
        else:
            raise RuntimeError('Unimplemented data type!')
-        gradS.data.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0).data)
+        gradS.data.copy_((gradSL*(SL/S.view(1, 1, K))).sum(0).sum(0).data)
        return gradX, gradC, gradS


@@ -128,10 +129,11 @@ def scaledL2(X, C, S):
        sl_{ik} = s_k \|x_i-c_k\|^2

    Shape:
-        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` :math:`S\in \mathcal{R}^K` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
+        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}`
+          :math:`C\in\mathcal{R}^{K\times D}` :math:`S\in \mathcal{R}^K`
+          (where :math:`B` is batch, :math:`N` is total number of features,
+          :math:`K` is number of codewords, :math:`D` is feature dimensions.)
        - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`

    """
    return _scaledL2.apply(X, C, S)
-
-
--- a/encoding/functions/syncbn.py
+++ b/encoding/functions/syncbn.py
@@ -5,107 +5,107 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-import threading
+"""Synchronized Batch Normalization functions"""
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torch.autograd import Function, Variable
 from .._ext import encoding_lib

 __all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']

 class _sum_square(Function):
+    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
-        B,C,H,W = input.size()
+        B, C, _, _ = input.size()
        with torch.cuda.device_of(input):
-            xsum    = input.new().resize_(C).zero_()
+            xsum = input.new().resize_(C).zero_()
            xsquare = input.new().resize_(C).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_sum_square_Forward(
-                    input.view(B,C,-1), xsum, xsquare)
+                    input.view(B, C, -1), xsum, xsquare)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_sum_square_Forward( 
-                    input.view(B,C,-1), xsum, xsquare)
+                encoding_lib.Encoding_Double_sum_square_Forward(
+                    input.view(B, C, -1), xsum, xsquare)
        else:
-            raise RuntimeError('Unimplemented data type!') 
+            raise RuntimeError('Unimplemented data type!')
        return xsum, xsquare

+    @staticmethod
    def backward(ctx, gradSum, gradSquare):
-        input, = ctx.saved_tensors
-        B,C,H,W = input.size()
-        with torch.cuda.device_of(input):
-            gradInput = input.new().resize_(B,C,H*W).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
+        input, = ctx.saved_variables
+        B, C, H, W = input.data.size()
+        with torch.cuda.device_of(input.data):
+            gradInput = Variable(input.data.new().resize_(B, C, H*W).zero_())
+        if isinstance(input.data, torch.cuda.FloatTensor):
+            with torch.cuda.device_of(input.data):
                encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput, input.view(B,C,-1), gradSum, gradSquare)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_sum_square_Backward( 
-                    gradInput, input.view(B,C,-1), gradSum, gradSquare)
+                    gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
+        elif isinstance(input.data, torch.cuda.DoubleTensor):
+            with torch.cuda.device_of(input.data):
+                encoding_lib.Encoding_Double_sum_square_Backward(
+                    gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
        else:
-            raise RuntimeError('Unimplemented data type!') 
-        return gradInput.view(B,C,H,W)
+            raise RuntimeError('Unimplemented data type!')
+        return gradInput.view(B, C, H, W)


 def sum_square(input):
    r"""
    Calculate sum of elements and sum of squares for Batch Normalization.
    """
-    return _sum_square()(input)
+    return _sum_square.apply(input)


 class _batchnorm(Function):
-    def __init__(ctx, training=False):
-        super(_batchnorm, ctx).__init__()
-        ctx.training = training
+    def __init__(self, training=False):
+        super(_batchnorm, self).__init__()
+        self.training = training

-    def forward(ctx, input, gamma, beta, mean, std):
-        ctx.save_for_backward(input, gamma, beta, mean, std)
-        assert(input.dim()==3)
+    def forward(self, input, gamma, beta, mean, std):
+        self.save_for_backward(input, gamma, beta, mean, std)
+        assert(input.dim() == 3)
        with torch.cuda.device_of(input):
            invstd = 1.0 / std
            output = input.new().resize_as_(input)
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Forward(output, 
+                encoding_lib.Encoding_Float_batchnorm_Forward(output, \
                    input, mean, invstd, gamma, beta)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Forward(output, 
+                encoding_lib.Encoding_Double_batchnorm_Forward(output, \
                    input, mean, invstd, gamma, beta)
        else:
            raise RuntimeError('Unimplemented data type!')
-        return output 
+        return output

-    def backward(ctx, gradOutput):
-        input, gamma, beta, mean, std = ctx.saved_tensors
+    def backward(self, gradOutput):
+        input, gamma, beta, mean, std = self.saved_tensors
        invstd = 1.0 / std
        with torch.cuda.device_of(input):
            gradInput = gradOutput.new().resize_as_(input).zero_()
            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
-            gradBeta  = gradOutput.new().resize_as_(beta).zero_()
-            gradMean  = gradOutput.new().resize_as_(mean).zero_()
-            gradStd   = gradOutput.new().resize_as_(std).zero_()
+            gradBeta = gradOutput.new().resize_as_(beta).zero_()
+            gradMean = gradOutput.new().resize_as_(mean).zero_()
+            gradStd = gradOutput.new().resize_as_(std).zero_()

        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta, 
+                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    ctx.training) 
+                    self.training)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta, 
+                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    ctx.training) 
+                    self.training)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradInput, gradGamma, gradBeta, gradMean, gradStd

--- a/encoding/nn/__init__.py
+++ b/encoding/nn/__init__.py
@@ -5,10 +5,10 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

+"""Encoding NN Modules"""
 from .encoding import *
 from .syncbn import *
-from .basic import *
 from .customize import *
--- a/encoding/nn/basic.py
+++ b/encoding/nn/basic.py
-##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
-##
-## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
-##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-
-import math
-import torch
-from torch.autograd import Variable
-from torch.nn import Module, Sequential
-from torch.nn import functional as F
-from torch.nn.parameter import Parameter
-from torch.nn.modules.utils import _single, _pair, _triple
-
-from ..parallel import my_data_parallel
-from ..functions import view_each
-
-__all__ = ['Module', 'Sequential', 'Conv1d', 'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d', 'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']
-
-class _ConvNd(Module):
-    def __init__(self, in_channels, out_channels, kernel_size, stride,
-                 padding, dilation, transposed, output_padding, groups, bias):
-        super(_ConvNd, self).__init__()
-        if in_channels % groups != 0:
-            raise ValueError('in_channels must be divisible by groups')
-        if out_channels % groups != 0:
-            raise ValueError('out_channels must be divisible by groups')
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-        self.dilation = dilation
-        self.transposed = transposed
-        self.output_padding = output_padding
-        self.groups = groups
-        if transposed:
-            self.weight = Parameter(torch.Tensor(
-                in_channels, out_channels // groups, *kernel_size))
-        else:
-            self.weight = Parameter(torch.Tensor(
-                out_channels, in_channels // groups, *kernel_size))
-        if bias:
-            self.bias = Parameter(torch.Tensor(out_channels))
-        else:
-            self.register_parameter('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        n = self.in_channels
-        for k in self.kernel_size:
-            n *= k
-        stdv = 1. / math.sqrt(n)
-        self.weight.data.uniform_(-stdv, stdv)
-        if self.bias is not None:
-            self.bias.data.uniform_(-stdv, stdv)
-
-    def __repr__(self):
-        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
-             ', stride={stride}')
-        if self.padding != (0,) * len(self.padding):
-            s += ', padding={padding}'
-        if self.dilation != (1,) * len(self.dilation):
-            s += ', dilation={dilation}'
-        if self.output_padding != (0,) * len(self.output_padding):
-            s += ', output_padding={output_padding}'
-        if self.groups != 1:
-            s += ', groups={groups}'
-        if self.bias is None:
-            s += ', bias=False'
-        s += ')'
-        return s.format(name=self.__class__.__name__, **self.__dict__)
-
-
-class Conv1d(_ConvNd):
-    r"""Applies a 1D convolution over an input signal composed of several 
-    input planes.
-    In the simplest case, the output value of the layer with input size
-    :math:`(N, C_{in}, L)` and output :math:`(N, C_{out}, L_{out})` can be
-    precisely described as:
-
-    .. math::
-        \begin{array}{ll}
-        out(N_i, C_{out_j})  = bias(C_{out_j})
-                       + \sum_{{k}=0}^{C_{in}-1} weight(C_{out_j}, k)  
-                       \star input(N_i, k)
-        \end{array}
-
-    where :math:`\star` is the valid `cross-correlation`_ operator
-
-    | :attr:`stride` controls the stride for the cross-correlation.
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points.
-    | :attr:`dilation` controls the spacing between the kernel points; also
-      known as the à trous algorithm. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
-    | :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels` must both be divisible by `groups`.
-    |       At groups=1, all inputs are convolved to all outputs.
-    |       At groups=2, the operation becomes equivalent to having two conv layers side by side, each seeing half the input channels, and producing half the output channels, and both subsequently concatenated.
-            At groups=`in_channels`, each input channel is convolved with its own set of filters (of size `out_channels // in_channels`).
-
-    Args:
-        in_channels (int): Number of channels in the input image
-        out_channels (int): Number of channels produced by the convolution
-        kernel_size (int or tuple): Size of the convolving kernel
-        stride (int or tuple, optional): Stride of the convolution. Default: 1
-        padding (int or tuple, optional): Zero-padding added to both sides of
-            the input. Default: 0
-        dilation (int or tuple, optional): Spacing between kernel
-            elements. Default: 1
-        groups (int, optional): Number of blocked connections from input
-            channels to output channels. Default: 1
-        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
-
-    Shape:
-        - Input: :math:`(N, C_{in}, L_{in})`
-        - Output: :math:`(N, C_{out}, L_{out})` where
-          :math:`L_{out} = floor((L_{in}  + 2 * padding - dilation * (kernel\_size - 1) - 1) / stride + 1)`
-
-    Attributes:
-        weight (Tensor): the learnable weights of the module of shape
-            (out_channels, in_channels, kernel_size)
-        bias (Tensor):   the learnable bias of the module of shape
-            (out_channels)
-
-    Examples::
-        >>> m = nn.Conv1d(16, 33, 3, stride=2)
-        >>> input = autograd.Variable(torch.randn(20, 16, 50))
-        >>> output = m(input)
-
-    .. _cross-correlation:
-        https://en.wikipedia.org/wiki/Cross-correlation
-    .. _link:
-        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
-
-    """
-
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True):
-        kernel_size = _single(kernel_size)
-        stride = _single(stride)
-        padding = _single(padding)
-        dilation = _single(dilation)
-        super(Conv1d, self).__init__(
-            in_channels, out_channels, kernel_size, stride, padding, dilation,
-            False, _single(0), groups, bias)
-
-    def forward(self, input):
-        return F.conv1d(input, self.weight, self.bias, self.stride,
-                        self.padding, self.dilation, self.groups)
-
-
-class Conv2d(_ConvNd):
-    r"""Applies a 2D convolution over an input signal composed of several input
-    planes.
-    In the simplest case, the output value of the layer with input size
-    :math:`(N, C_{in}, H, W)` and output :math:`(N, C_{out}, H_{out}, W_{out})`
-    can be precisely described as:
-
-    .. math::
-        \begin{array}{ll}
-        out(N_i, C_{out_j})  = bias(C_{out_j})
-                       + \sum_{{k}=0}^{C_{in}-1} weight(C_{out_j}, k)  \star input(N_i, k)
-        \end{array}
-
-    where :math:`\star` is the valid 2D `cross-correlation`_ operator
-
-    | :attr:`stride` controls the stride for the cross-correlation.
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points.
-    | :attr:`dilation` controls the spacing between the kernel points; also
-      known as the à trous algorithm. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
-    | :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels` must both be divisible by `groups`.
-    |       At groups=1, all inputs are convolved to all outputs.
-    |       At groups=2, the operation becomes equivalent to having two conv layers side by side, each seeing half the input channels, and producing half the output channels, and both subsequently concatenated.
-            At groups=`in_channels`, each input channel is convolved with its own set of filters (of size `out_channels // in_channels`).
-
-    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
-        - a single ``int`` -- in which case the same value is used for the height and width dimension
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
-          and the second `int` for the width dimension
-
-    Args:
-        in_channels (int): Number of channels in the input image
-        out_channels (int): Number of channels produced by the convolution
-        kernel_size (int or tuple): Size of the convolving kernel
-        stride (int or tuple, optional): Stride of the convolution. Default: 1
-        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
-        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
-        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
-        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
-
-    Shape:
-        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
-        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
-          :math:`H_{out} = floor((H_{in}  + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
-          :math:`W_{out} = floor((W_{in}  + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`
-
-    Attributes:
-        weight (Tensor): the learnable weights of the module of shape
-                         (out_channels, in_channels, kernel_size[0], kernel_size[1])
-        bias (Tensor):   the learnable bias of the module of shape (out_channels)
-
-    Examples::
-        >>> # With square kernels and equal stride
-        >>> m = nn.Conv2d(16, 33, 3, stride=2)
-        >>> # non-square kernels and unequal stride and with padding
-        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
-        >>> # non-square kernels and unequal stride and with padding and dilation
-        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
-        >>> input = autograd.Variable(torch.randn(20, 16, 50, 100))
-        >>> output = m(input)
-
-    .. _cross-correlation:
-        https://en.wikipedia.org/wiki/Cross-correlation
-    .. _link:
-        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
-
-    """
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True):
-        kernel_size = _pair(kernel_size)
-        stride = _pair(stride)
-        padding = _pair(padding)
-        dilation = _pair(dilation)
-        super(Conv2d, self).__init__(
-            in_channels, out_channels, kernel_size, stride, padding, dilation,
-            False, _pair(0), groups, bias)
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return F.conv2d(input, self.weight, self.bias, self.stride,
-                            self.padding, self.dilation, self.groups)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-
-class _ConvTransposeMixin(object):
-    def forward(self, input, output_size=None):
-        output_padding = self._output_padding(input, output_size)
-        func = self._backend.ConvNd(
-            self.stride, self.padding, self.dilation, self.transposed,
-            output_padding, self.groups)
-        if self.bias is None:
-            return func(input, self.weight)
-        else:
-            return func(input, self.weight, self.bias)
-
-    def _output_padding(self, input, output_size):
-        if output_size is None:
-            return self.output_padding
-
-        output_size = list(output_size)
-        k = input.dim() - 2
-        if len(output_size) == k + 2:
-            output_size = output_size[-2:]
-        if len(output_size) != k:
-            raise ValueError(
-                "output_size must have {} or {} elements (got {})"
-                .format(k, k + 2, len(output_size)))
-
-        def dim_size(d):
-            return ((input.size(d + 2) - 1) * self.stride[d] -
-                    2 * self.padding[d] + self.kernel_size[d])
-
-        min_sizes = [dim_size(d) for d in range(k)]
-        max_sizes = [min_sizes[d] + self.stride[d] - 1 for d in range(k)]
-        for size, min_size, max_size in zip(output_size, min_sizes, max_sizes):
-            if size < min_size or size > max_size:
-                raise ValueError((
-                    "requested an output size of {}, but valid sizes range "
-                    "from {} to {} (for an input of {})").format(
-                        output_size, min_sizes, max_sizes, input.size()[2:]))
-
-        return tuple([output_size[d] - min_sizes[d] for d in range(k)])
-
-
-class ConvTranspose2d(_ConvTransposeMixin, _ConvNd):
-    r"""Applies a 2D transposed convolution operator over an input image
-    composed of several input planes.
-    This module can be seen as the gradient of Conv2d with respect to its input.
-    It is also known as a fractionally-strided convolution or
-    a deconvolution (although it is not an actual deconvolution operation).
-
-    | :attr:`stride` controls the stride for the cross-correlation.
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points.
-    | If :attr:`output_padding` is non-zero, then the output is implicitly zero-padded on one side for :attr:`output_padding` number of points.
-    | :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
-    | :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels` must both be divisible by `groups`.
-    |       At groups=1, all inputs are convolved to all outputs.
-    |       At groups=2, the operation becomes equivalent to having two conv layers side by side, each seeing half the input channels, and producing half the output channels, and both subsequently concatenated.
-            At groups=`in_channels`, each input channel is convolved with its own set of filters (of size `out_channels // in_channels`).
-
-    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` can either be:
-        - a single ``int`` -- in which case the same value is used for the height and width dimensions
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
-          and the second `int` for the width dimension
-
-    Args:
-        in_channels (int): Number of channels in the input image
-        out_channels (int): Number of channels produced by the convolution
-        kernel_size (int or tuple): Size of the convolving kernel
-        stride (int or tuple, optional): Stride of the convolution. Default: 1
-        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
-        output_padding (int or tuple, optional): Zero-padding added to one side of the output. Default: 0
-        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
-        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
-        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
-
-    Shape:
-        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
-        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
-          :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]`
-          :math:`W_{out} = (W_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]`
-
-    Attributes:
-        weight (Tensor): the learnable weights of the module of shape
-                         (in_channels, out_channels, kernel_size[0], kernel_size[1])
-        bias (Tensor):   the learnable bias of the module of shape (out_channels)
-
-    Examples::
-        >>> # With square kernels and equal stride
-        >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2)
-        >>> # non-square kernels and unequal stride and with padding
-        >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
-        >>> input = autograd.Variable(torch.randn(20, 16, 50, 100))
-        >>> output = m(input)
-        >>> # exact output size can be also specified as an argument
-        >>> input = autograd.Variable(torch.randn(1, 16, 12, 12))
-        >>> downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1)
-        >>> upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
-        >>> h = downsample(input)
-        >>> h.size()
-        torch.Size([1, 16, 6, 6])
-        >>> output = upsample(h, output_size=input.size())
-        >>> output.size()
-        torch.Size([1, 16, 12, 12])
-
-    .. _cross-correlation:
-        https://en.wikipedia.org/wiki/Cross-correlation
-    .. _link:
-        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
-    """
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, output_padding=0, groups=1, bias=True, 
-                 dilation=1):
-        kernel_size = _pair(kernel_size)
-        stride = _pair(stride)
-        padding = _pair(padding)
-        dilation = _pair(dilation)
-        output_padding = _pair(output_padding)
-        super(ConvTranspose2d, self).__init__(
-            in_channels, out_channels, kernel_size, stride, padding, dilation,
-            True, output_padding, groups, bias)
-
-    def forward(self, input, output_size=None):
-        output_padding = self._output_padding(input, output_size)
-        if isinstance(input, Variable):
-            return F.conv_transpose2d(
-                input, self.weight, self.bias, self.stride, self.padding,
-                output_padding, self.groups, self.dilation)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-
-class Threshold(Module):
-    def __init__(self, threshold, value, inplace=False):
-        super(Threshold, self).__init__()
-        self.threshold = threshold
-        self.value = value
-        self.inplace = inplace
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return F.threshold(input, self.threshold, self.value, 
-                               self.inplace)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-    def __repr__(self):
-        inplace_str = ', inplace' if self.inplace else ''
-        return self.__class__.__name__ + ' (' \
-            + str(self.threshold) \
-            + ', ' + str(self.value) \
-            + inplace_str + ')'
-
-
-class ReLU(Threshold):
-    """Applies the rectified linear unit function element-wise
-    :math:`{ReLU}(x)= max(0, x)`
-    Args:
-        inplace: can optionally do the operation in-place. Default: False
-    Shape:
-        - Input: :math:`(N, *)` where `*` means, any number of additional
-          dimensions
-        - Output: :math:`(N, *)`, same shape as the input
-    Examples::
-        >>> m = nn.ReLU()
-        >>> input = autograd.Variable(torch.randn(2))
-        >>> print(input)
-        >>> print(m(input))
-    """
-    def __init__(self, inplace=False):
-        super(ReLU, self).__init__(0, 0, inplace)
-
-    def __repr__(self):
-        inplace_str = 'inplace' if self.inplace else ''
-        return self.__class__.__name__ + ' (' \
-            + inplace_str + ')'
-
-
-class Sigmoid(Module):
-    """Applies the element-wise function :math:`f(x) = 1 / ( 1 + exp(-x))`
-    Shape:
-
-        - Input: :math:`(N, *)` where `*` means, any number of additional
-          dimensions
-        - Output: :math:`(N, *)`, same shape as the input
-
-    Examples::
-
-        >>> m = nn.Sigmoid()
-        >>> input = autograd.Variable(torch.randn(2))
-        >>> print(input)
-        >>> print(m(input))
-    """
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return torch.sigmoid(input)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-    def __repr__(self):
-        return self.__class__.__name__ + ' ()'
-
-
-class MaxPool2d(Module):
-    r"""Applies a 2D max pooling over an input signal composed of several 
-    input planes.
-    In the simplest case, the output value of the layer with input size 
-    :math:`(N, C, H, W)`, output :math:`(N, C, H_{out}, W_{out})` and 
-    :attr:`kernel_size` :math:`(kH, kW)`
-    can be precisely described as:
-
-    .. math::
-        \begin{array}{ll}
-        out(N_i, C_j, h, w)  = \max_{{m}=0}^{kH-1} \max_{{n}=0}^{kW-1}
-                               input(N_i, C_j, stride[0] * h + m, stride[1] * w + n)
-        \end{array}
-
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
-      for :attr:`padding` number of points
-    | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
-
-    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
-        - a single ``int`` -- in which case the same value is used for the height and width dimension
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
-          and the second `int` for the width dimension
-
-    Args:
-        kernel_size: the size of the window to take a max over
-        stride: the stride of the window. Default value is :attr:`kernel_size`
-        padding: implicit zero padding to be added on both sides
-        dilation: a parameter that controls the stride of elements in the window
-        return_indices: if True, will return the max indices along with the outputs.
-                        Useful when Unpooling later
-        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
-
-    Shape:
-        - Input: :math:`(N, C, H_{in}, W_{in})`
-        - Output: :math:`(N, C, H_{out}, W_{out})` where
-          :math:`H_{out} = floor((H_{in}  + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
-          :math:`W_{out} = floor((W_{in}  + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`
-    Examples::
-        >>> # pool of square window of size=3, stride=2
-        >>> m = nn.MaxPool2d(3, stride=2)
-        >>> # pool of non-square window
-        >>> m = nn.MaxPool2d((3, 2), stride=(2, 1))
-        >>> input = autograd.Variable(torch.randn(20, 16, 50, 32))
-        >>> output = m(input)
-
-    .. _link:
-        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
-
-    """
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
-                 return_indices=False, ceil_mode=False):
-        super(MaxPool2d, self).__init__()
-        self.kernel_size = kernel_size
-        self.stride = stride or kernel_size
-        self.padding = padding
-        self.dilation = dilation
-        self.return_indices = return_indices
-        self.ceil_mode = ceil_mode
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return F.max_pool2d(input, self.kernel_size, self.stride, \
-                self.padding, self.dilation, self.ceil_mode, \
-                self.return_indices)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-    def __repr__(self):
-        kh, kw = _pair(self.kernel_size)
-        dh, dw = _pair(self.stride)
-        padh, padw = _pair(self.padding)
-        dilh, dilw = _pair(self.dilation)
-        padding_str = ', padding=(' + str(padh) + ', ' + str(padw) + ')' \
-            if padh != 0 and padw != 0 else ''
-        dilation_str = (', dilation=(' + str(dilh) + ', ' + str(dilw) + ')'
-                        if dilh != 0 and dilw != 0 else '')
-        return self.__class__.__name__ + ' (' \
-            + 'size=(' + str(kh) + ', ' + str(kw) + ')' \
-            + ', stride=(' + str(dh) + ', ' + str(dw) + ')' \
-            + padding_str + dilation_str + ')'
-
-
-class AvgPool2d(Module):
-    r"""Applies a 2D average pooling over an input signal composed of several input
-    planes.
-    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
-    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
-    can be precisely described as:
-
-    .. math::
-        \begin{array}{ll}
-        out(N_i, C_j, h, w)  = 1 / (kH * kW) * \sum_{{m}=0}^{kH-1} \sum_{{n}=0}^{kW-1}
-                               input(N_i, C_j, stride[0] * h + m, stride[1] * w + n)
-        \end{array}
-
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points
-
-    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be:
-        - a single ``int`` -- in which case the same value is used for the height and width dimension
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
-          and the second `int` for the width dimension
-
-    Args:
-        kernel_size: the size of the window
-        stride: the stride of the window. Default value is :attr:`kernel_size`
-        padding: implicit zero padding to be added on both sides
-        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
-        count_include_pad: when True, will include the zero-padding in the averaging calculation
-
-    Shape:
-        - Input: :math:`(N, C, H_{in}, W_{in})`
-        - Output: :math:`(N, C, H_{out}, W_{out})` where
-          :math:`H_{out} = floor((H_{in}  + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)`
-          :math:`W_{out} = floor((W_{in}  + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)`
-
-    Examples::
-        >>> # pool of square window of size=3, stride=2
-        >>> m = nn.AvgPool2d(3, stride=2)
-        >>> # pool of non-square window
-        >>> m = nn.AvgPool2d((3, 2), stride=(2, 1))
-        >>> input = autograd.Variable(torch.randn(20, 16, 50, 32))
-        >>> output = m(input)
-
-    """
-    def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,
-                 count_include_pad=True):
-        super(AvgPool2d, self).__init__()
-        self.kernel_size = kernel_size
-        self.stride = stride or kernel_size
-        self.padding = padding
-        self.ceil_mode = ceil_mode
-        self.count_include_pad = count_include_pad
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return F.avg_pool2d(input, self.kernel_size, self.stride,
-                                self.padding, self.ceil_mode, self.count_include_pad)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-
-class AdaptiveAvgPool2d(Module):
-    """Applies a 2D adaptive average pooling over an input signal composed of several input planes.
-    The output is of size H x W, for any input size.
-    The number of output features is equal to the number of input planes.
-
-    Args:
-        output_size: the target output size of the image of the form H x W.
-                     Can be a tuple (H, W) or a single number H for a square image H x H
-
-    Examples:
-        >>> # target output size of 5x7
-        >>> m = nn.AdaptiveAvgPool2d((5,7))
-        >>> input = autograd.Variable(torch.randn(1, 64, 8, 9))
-        >>> output = m(input)
-        >>> # target output size of 7x7 (square)
-        >>> m = nn.AdaptiveAvgPool2d(7)
-        >>> input = autograd.Variable(torch.randn(1, 64, 10, 9))
-        >>> output = m(input)
-
-    """
-    def __init__(self, output_size):
-        super(AdaptiveAvgPool2d, self).__init__()
-        self.output_size = output_size
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return F.adaptive_avg_pool2d(input, self.output_size)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-    def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-            + 'output_size=' + str(self.output_size) + ')'
-
-
-class Dropout2d(Module):
-    r"""Randomly zeroes whole channels of the input tensor.
-    The channels to zero-out are randomized on every forward call.
-    *Usually the input comes from Conv2d modules.*
-    As described in the paper
-    `Efficient Object Localization Using Convolutional Networks`_ ,
-    if adjacent pixels within feature maps are strongly correlated
-    (as is normally the case in early convolution layers) then iid dropout
-    will not regularize the activations and will otherwise just result
-    in an effective learning rate decrease.
-    In this case, :func:`nn.Dropout2d` will help promote independence between
-    feature maps and should be used instead.
-
-    Args:
-        p (float, optional): probability of an element to be zeroed.
-        inplace (bool, optional): If set to True, will do this operation
-            in-place
-
-    Shape:
-        - Input: :math:`(N, C, H, W)`
-        - Output: :math:`(N, C, H, W)` (same shape as input)
-
-    Examples::
-        >>> m = nn.Dropout2d(p=0.2)
-        >>> input = autograd.Variable(torch.randn(20, 16, 32, 32))
-        >>> output = m(input)
-
-    .. _Efficient Object Localization Using Convolutional Networks:
-       http://arxiv.org/abs/1411.4280
-
-    """
-    def __init__(self, p=0.5, inplace=False):
-        super(Dropout2d, self).__init__()
-        if p < 0 or p > 1:
-            raise ValueError("dropout probability has to be between 0 and 1, "
-                             "but got {}".format(p))
-        self.p = p
-        self.inplace = inplace
-        self.drop = torch.nn.Dropout2d(p=p, inplace=inplace)
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return self.drop(input)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self.drop, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-    def __repr__(self):
-        inplace_str = ', inplace' if self.inplace else ''
-        return self.__class__.__name__ + ' (' \
-            + 'p=' + str(self.p) \
-            + inplace_str + ')'
-
-
-class Linear(Module):
-    r"""Applies a linear transformation to the incoming data: :math:`y = Ax + b`
-
-    Args:
-        in_features: size of each input sample
-        out_features: size of each output sample
-        bias: If set to False, the layer will not learn an additive bias.
-            Default: True
-
-    Shape:
-        - Input: :math:`(N, *, in\_features)` where `*` means any number of
-          additional dimensions
-        - Output: :math:`(N, *, out\_features)` where all but the last dimension
-          are the same shape as the input.
-
-    Attributes:
-        weight: the learnable weights of the module of shape
-            (out_features x in_features)
-        bias:   the learnable bias of the module of shape (out_features)
-
-    Examples::
-        >>> m = nn.Linear(20, 30)
-        >>> input = autograd.Variable(torch.randn(128, 20))
-        >>> output = m(input)
-        >>> print(output.size())
-
-    """
-    def __init__(self, in_features, out_features, bias=True):
-        super(Linear, self).__init__()
-        self.in_features = in_features
-        self.out_features = out_features
-        self.weight = Parameter(torch.Tensor(out_features, in_features))
-        if bias:
-            self.bias = Parameter(torch.Tensor(out_features))
-        else:
-            self.register_parameter('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        stdv = 1. / math.sqrt(self.weight.size(1))
-        self.weight.data.uniform_(-stdv, stdv)
-        if self.bias is not None:
-            self.bias.data.uniform_(-stdv, stdv)
-
-    def forward(self, input):
-        if isinstance(input, Variable):
-            return F.linear(input, self.weight, self.bias)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
-    def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-            + str(self.in_features) + ' -> ' \
-            + str(self.out_features) + ')'
--- a/encoding/nn/customize.py
+++ b/encoding/nn/customize.py
@@ -5,19 +5,15 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-import math
+"""Encoding Customized NN Module"""
 import torch
-from torch.autograd import Variable
-from torch.nn import Module, Parameter
+from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d
 from torch.nn import functional as F

-from ..parallel import my_data_parallel
 from .syncbn import BatchNorm2d
-from ..functions import view_each, upsample
-from .basic import *

 __all__ = ['GramMatrix', 'View', 'Sum', 'Mean', 'Normalize', 'PyramidPooling']

@@ -48,12 +44,7 @@ class View(Module):
            self.size = torch.Size(args)

    def forward(self, input):
-        if isinstance(input, Variable):
-            return input.view(self.size)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return view_each(input, self.size)
-        else:
-            raise RuntimeError('unknown input type')
+        return input.view(self.size)


 class Sum(Module):
@@ -63,12 +54,7 @@ class Sum(Module):
        self.keep_dim = keep_dim

    def forward(self, input):
-        if isinstance(input, Variable):
-            return input.sum(self.dim, self.keep_dim)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
+        return input.sum(self.dim, self.keep_dim)


 class Mean(Module):
@@ -78,12 +64,7 @@ class Mean(Module):
        self.keep_dim = keep_dim

    def forward(self, input):
-        if isinstance(input, Variable):
-            return input.mean(self.dim, self.keep_dim)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
+        return input.mean(self.dim, self.keep_dim)


 class Normalize(Module):
@@ -108,20 +89,15 @@ class Normalize(Module):
    def __init__(self, p=2, dim=1):
        super(Normalize, self).__init__()
        self.p = p
-        self.dim =dim
+        self.dim = dim

    def forward(self, x):
-        if isinstance(x, Variable):
-            return F.normalize(x, self.p, self.dim, eps=1e-10)
-        elif isinstance(x, tuple) or isinstance(x, list):
-            return my_data_parallel(self, x)
-        else:
-            raise RuntimeError('unknown input type')
+        return F.normalize(x, self.p, self.dim, eps=1e-10)


 class PyramidPooling(Module):
    """
-    Reference: 
+    Reference:
        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
    """
    def __init__(self, in_channels):
@@ -146,31 +122,16 @@ class PyramidPooling(Module):
                                ReLU(True))

    def _cat_each(self, x, feat1, feat2, feat3, feat4):
-        assert(len(x)==len(feat1))
+        assert(len(x) == len(feat1))
        z = []
        for i in range(len(x)):
-            z.append( torch.cat((x[i], feat1[i], feat2[i], feat3[i], feat4[i]), 1))
+            z.append(torch.cat((x[i], feat1[i], feat2[i], feat3[i], feat4[i]), 1))
        return z

    def forward(self, x):
-        if isinstance(x, Variable):
-            _, _, h, w = x.size()
-        elif isinstance(x, tuple) or isinstance(x, list):
-            _, _, h, w = x[0].size()
-        else:
-            raise RuntimeError('unknown input type')
-        feat1 = upsample(self.conv1(self.pool1(x)),(h,w),
-                              mode='bilinear')
-        feat2 = upsample(self.conv2(self.pool2(x)),(h,w),
-                              mode='bilinear')
-        feat3 = upsample(self.conv3(self.pool3(x)),(h,w), 
-                              mode='bilinear')
-        feat4 = upsample(self.conv4(self.pool4(x)),(h,w), 
-                              mode='bilinear')
-        if isinstance(x, Variable):
-            return torch.cat((x, feat1, feat2, feat3, feat4), 1)
-        elif isinstance(x, tuple) or isinstance(x, list):
-            return self._cat_each(x, feat1, feat2, feat3, feat4)
-        else:
-            raise RuntimeError('unknown input type')
-
+        _, _, h, w = x.size()
+        feat1 = F.upsample(self.conv1(self.pool1(x)), (h, w), mode='bilinear')
+        feat2 = F.upsample(self.conv2(self.pool2(x)), (h, w), mode='bilinear')
+        feat3 = F.upsample(self.conv3(self.pool3(x)), (h, w), mode='bilinear')
+        feat4 = F.upsample(self.conv4(self.pool4(x)), (h, w), mode='bilinear')
+        return torch.cat((x, feat1, feat2, feat3, feat4), 1)
--- a/encoding/nn/encoding.py
+++ b/encoding/nn/encoding.py
@@ -5,53 +5,65 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-import threading
+"""Encoding Package Core NN Modules."""
 import torch
 from torch.nn import Module, Parameter
 import torch.nn.functional as F
-from torch.autograd import Function, Variable
-from torch.nn.modules.utils import _single, _pair, _triple
+from torch.autograd import Variable
+from torch.nn.modules.utils import _pair

-from .._ext import encoding_lib
-from ..functions import scaledL2, aggregate
-from ..parallel import my_data_parallel
-from ..functions import dilatedavgpool2d
+from ..functions import scaledL2, aggregate, dilatedavgpool2d

-__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'DilatedAvgPool2d', 'UpsampleConv2d'] 
+__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'DilatedAvgPool2d', 'UpsampleConv2d']

 class Encoding(Module):
    r"""
-    Encoding Layer: a learnable residual encoder over 3d or 4d input that 
-    is seen as a mini-batch.
+    Encoding Layer: a learnable residual encoder.

    .. image:: _static/img/cvpr17.svg
        :width: 50%
        :align: center

-    .. math::
+    Encoding Layer accepts 3D or 4D inputs.
+    It considers an input feature map with the shape of :math:`C\times H\times W`
+    as a set of C-dimensional input features :math:`X=\{x_1, ...x_N\}`, where N is the total
+    number of features given by :math:`H\times W`, and learns an inherent codebook
+    :math:`D=\{d_1,...d_K\}` and a set of smoothing factor of visual centers
+    :math:`S=\{s_1,...s_K\}`. Encoding Layer outputs the residuals with soft-assignment weights
+    :math:`e_k=\sum_{i=1}^Ne_{ik}`, where

-        e_{ik} = \frac{exp(-s_k\|x_{i}-c_k\|^2)}{\sum_{j=1}^K exp(-s_j\|x_{i}-c_j\|^2)} (x_i - c_k)
+    .. math::

-    Please see the `example of training Deep TEN <./experiments/texture.html>`_.
+        e_{ik} = \frac{exp(-s_k\|r_{ik}\|^2)}{\sum_{j=1}^K exp(-s_j\|r_{ij}\|^2)} r_{ik}

-    Reference:
-        Hang Zhang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
+    and the residuals are given by :math:`r_{ik} = x_i - d_k`. The output encoders are
+    :math:`E=\{e_1,...e_K\}`.

    Args:
        D: dimention of the features or feature channels
        K: number of codeswords

    Shape:
-        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` or :math:`\mathcal{R}^{B\times D\times H\times W}` (where :math:`B` is batch, :math:`N` is total number of features or :math:`H\times W`.)
+        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` or
+          :math:`\mathcal{R}^{B\times D\times H\times W}` (where :math:`B` is batch,
+          :math:`N` is total number of features or :math:`H\times W`.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
-        
+
    Attributes:
        codewords (Tensor): the learnable codewords of shape (:math:`K\times D`)
        scale (Tensor): the learnable scale factor of visual centers

+    Reference:
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
+        Amit Agrawal. "Context Encoding for Semantic Segmentation."
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
+
+        Hang Zhang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network."
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
+
    Examples:
        >>> import encoding
        >>> import torch
@@ -66,32 +78,26 @@ class Encoding(Module):
        super(Encoding, self).__init__()
        # init codewords and smoothing factor
        self.D, self.K = D, K
-        self.codewords = Parameter(torch.Tensor(K, D), 
-            requires_grad=True)
-        self.scale = Parameter(torch.Tensor(K), requires_grad=True) 
+        self.codewords = Parameter(torch.Tensor(K, D), requires_grad=True)
+        self.scale = Parameter(torch.Tensor(K), requires_grad=True)
        self.reset_params()
-        
+
    def reset_params(self):
        std1 = 1./((self.K*self.D)**(1/2))
        self.codewords.data.uniform_(-std1, std1)
        self.scale.data.uniform_(-1, 0)

    def forward(self, X):
-        if isinstance(X, tuple) or isinstance(X, list):
-            # for self-parallel mode, please see encoding.nn
-            return my_data_parallel(self, X)
-        elif not isinstance(X, Variable):
-            raise RuntimeError('unknown input type')
        # input X is a 4D tensor
-        assert(X.size(1)==self.D)
+        assert(X.size(1) == self.D)
        if X.dim() == 3:
            # BxDxN
-            B, N, K, D = X.size(0), X.size(2), self.K, self.D
-            X = X.transpose(1,2).contiguous()
+            B, D = X.size(0), self.D
+            X = X.transpose(1, 2).contiguous()
        elif X.dim() == 4:
            # BxDxHxW
-            B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
-            X = X.view(B,D,-1).transpose(1,2).contiguous()
+            B, D = X.size(0), self.D
+            X = X.view(B, D, -1).transpose(1, 2).contiguous()
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        # assignment weights NxKxD
@@ -106,15 +112,16 @@ class Encoding(Module):
            + str(self.D) + ')'

 class EncodingDrop(Module):
+    r"""Dropout regularized Encoding Layer.
+    """
    def __init__(self, D, K):
        super(EncodingDrop, self).__init__()
        # init codewords and smoothing factor
        self.D, self.K = D, K
-        self.codewords = Parameter(torch.Tensor(K, D), 
-            requires_grad=True)
-        self.scale = Parameter(torch.Tensor(K), requires_grad=True) 
+        self.codewords = Parameter(torch.Tensor(K, D), requires_grad=True)
+        self.scale = Parameter(torch.Tensor(K), requires_grad=True)
        self.reset_params()
-        
+
    def reset_params(self):
        std1 = 1./((self.K*self.D)**(1/2))
        self.codewords.data.uniform_(-std1, std1)
@@ -127,21 +134,16 @@ class EncodingDrop(Module):
            self.scale.data.zero_().add_(-0.5)

    def forward(self, X):
-        if isinstance(X, tuple) or isinstance(X, list):
-            # for self-parallel mode, please see encoding.nn
-            return my_data_parallel(self, X)
-        elif not isinstance(X, Variable):
-            raise RuntimeError('unknown input type')
        # input X is a 4D tensor
-        assert(X.size(1)==self.D)
+        assert(X.size(1) == self.D)
        if X.dim() == 3:
            # BxDxN
-            B, N, K, D = X.size(0), X.size(2), self.K, self.D
-            X = X.transpose(1,2).contiguous()
+            B, D = X.size(0), self.D
+            X = X.transpose(1, 2).contiguous()
        elif X.dim() == 4:
            # BxDxHxW
-            B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
-            X = X.view(B,D,-1).transpose(1,2).contiguous()
+            B, D = X.size(0), self.D
+            X = X.view(B, D, -1).transpose(1, 2).contiguous()
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        self._drop()
@@ -159,25 +161,28 @@ class EncodingDrop(Module):


 class Inspiration(Module):
-    r""" 
-    Inspiration Layer (CoMatch Layer) enables the multi-style transfer in feed-forward network, which learns to match the target feature statistics during the training. 
-    This module is differentialble and can be inserted in standard feed-forward network to be learned directly from the loss function without additional supervision. 
+    r"""
+    Inspiration Layer (CoMatch Layer) enables the multi-style transfer in feed-forward
+    network, which learns to match the target feature statistics during the training.
+    This module is differentiable and can be inserted into a standard feed-forward network
+    to be learned directly from the loss function without additional supervision.

    .. math::
        Y = \phi^{-1}[\phi(\mathcal{F}^T)W\mathcal{G}]

-    Please see the `example of MSG-Net <./experiments/style.html>`_  
+    Please see the `example of MSG-Net <./experiments/style.html>`_
    training multi-style generative network for real-time transfer.

    Reference:
-        Hang Zhang and Kristin Dana. "Multi-style Generative Network for Real-time Transfer."  *arXiv preprint arXiv:1703.06953 (2017)*
+        Hang Zhang and Kristin Dana. "Multi-style Generative Network for Real-time Transfer."
+        *arXiv preprint arXiv:1703.06953 (2017)*
    """
    def __init__(self, C, B=1):
        super(Inspiration, self).__init__()
        # B is equal to 1 or input mini_batch
-        self.weight = Parameter(torch.Tensor(1,C,C), requires_grad=True)
+        self.weight = Parameter(torch.Tensor(1, C, C), requires_grad=True)
        # non-parameter buffer
-        self.G = Variable(torch.Tensor(B,C,C), requires_grad=True)
+        self.G = Variable(torch.Tensor(B, C, C), requires_grad=True)
        self.C = C
        self.reset_parameters()

@@ -189,8 +194,9 @@ class Inspiration(Module):

    def forward(self, X):
        # input X is a 3D feature map
-        self.P = torch.bmm(self.weight.expand_as(self.G),self.G)
-        return torch.bmm(self.P.transpose(1,2).expand(X.size(0), self.C, self.C), X.view(X.size(0),X.size(1),-1)).view_as(X)
+        self.P = torch.bmm(self.weight.expand_as(self.G), self.G)
+        return torch.bmm(self.P.transpose(1, 2).expand(X.size(0), self.C, self.C),
+                         X.view(X.size(0), X.size(1), -1)).view_as(X)

    def __repr__(self):
        return self.__class__.__name__ + '(' \
@@ -203,18 +209,21 @@ class DilatedAvgPool2d(Module):

    Reference:

-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
+        Amit Agrawal. "Context Encoding for Semantic Segmentation."
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*

    Applies a 2D average pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
-    output :math:`(B, C, H_{out}, W_{out})`, :attr:`kernel_size` :math:`(k_H,k_W)`, :attr:`stride` :math:`(s_H,s_W)` :attr:`dilation` :math:`(d_H,d_W)`
+    output :math:`(B, C, H_{out}, W_{out})`, :attr:`kernel_size` :math:`(k_H,k_W)`,
+    :attr:`stride` :math:`(s_H,s_W)` :attr:`dilation` :math:`(d_H,d_W)`
    can be precisely described as:

    .. math::

        \begin{array}{ll}
-        out(b, c, h, w)  = 1 / (k_H \cdot k_W) \cdot 
+        out(b, c, h, w)  = 1 / (k_H \cdot k_W) \cdot
        \sum_{{m}=0}^{k_H-1} \sum_{{n}=0}^{k_W-1}
        input(b, c, s_H \cdot h + d_H \cdot m, s_W \cdot w + d_W \cdot n)
        \end{array}
@@ -222,11 +231,13 @@ class DilatedAvgPool2d(Module):
    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
      for :attr:`padding` number of points

-    | The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+    | The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`,
+      :attr:`dilation` can either be:

-        - a single ``int`` -- in which case the same value is used for the height and width dimension
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
-          and the second `int` for the width dimension
+        - a single ``int`` -- in which case the same value is used for the height
+          and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for
+          the height dimension, and the second `int` for the width dimension

    Args:
        kernel_size: the size of the window
@@ -257,13 +268,8 @@ class DilatedAvgPool2d(Module):
        self.dilation = dilation

    def forward(self, input):
-        if isinstance(input, Variable):
-            return dilatedavgpool2d(input, self.kernel_size, self.stride,
+        return dilatedavgpool2d(input, self.kernel_size, self.stride,
                                self.padding, self.dilation)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
@@ -275,14 +281,17 @@ class DilatedAvgPool2d(Module):

 class UpsampleConv2d(Module):
    r"""
-    To avoid the checkerboard artifacts of standard Fractionally-strided Convolution, we adapt an integer stride convolution but producing a :math:`2\times 2` outputs for each convolutional window. 
+    To avoid the checkerboard artifacts of standard Fractionally-strided Convolution,
+    we adapt an integer stride convolution but producing a :math:`2\times 2` outputs for
+    each convolutional window.

    .. image:: _static/img/upconv.png
        :width: 50%
        :align: center

    Reference:
-        Hang Zhang and Kristin Dana. "Multi-style Generative Network for Real-time Transfer."  *arXiv preprint arXiv:1703.06953 (2017)*
+        Hang Zhang and Kristin Dana. "Multi-style Generative Network for Real-time Transfer."
+        *arXiv preprint arXiv:1703.06953 (2017)*

    Args:
        in_channels (int): Number of channels in the input image
@@ -290,8 +299,10 @@ class UpsampleConv2d(Module):
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
-        output_padding (int or tuple, optional): Zero-padding added to one side of the output. Default: 0
-        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        output_padding (int or tuple, optional): Zero-padding added to one side of the output.
+          Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output
+          channels. Default: 1
        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
        scale_factor (int): scaling factor for upsampling convolution. Default: 1
@@ -327,7 +338,7 @@ class UpsampleConv2d(Module):

    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, scale_factor =1, 
+                 padding=0, dilation=1, groups=1, scale_factor=1,
                 bias=True):
        super(UpsampleConv2d, self).__init__()
        kernel_size = _pair(kernel_size)
@@ -347,11 +358,11 @@ class UpsampleConv2d(Module):
        self.groups = groups
        self.scale_factor = scale_factor
        self.weight = Parameter(torch.Tensor(
-            out_channels * scale_factor * scale_factor, 
+            out_channels * scale_factor * scale_factor,
            in_channels // groups, *kernel_size))
        if bias:
-            self.bias = Parameter(torch.Tensor(out_channels * 
-                scale_factor * scale_factor))
+            self.bias = Parameter(torch.Tensor(
+                out_channels * scale_factor * scale_factor))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
@@ -366,12 +377,6 @@ class UpsampleConv2d(Module):
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
-        if isinstance(input, Variable):
-            out = F.conv2d(input, self.weight, self.bias, self.stride,
-                            self.padding, self.dilation, self.groups)
-            return F.pixel_shuffle(out, self.scale_factor)
-        elif isinstance(input, tuple) or isinstance(input, list):
-            return my_data_parallel(self, input)
-        else:
-            raise RuntimeError('unknown input type')
-
+        out = F.conv2d(input, self.weight, self.bias, self.stride,
+                       self.padding, self.dilation, self.groups)
+        return F.pixel_shuffle(out, self.scale_factor)
--- a/encoding/nn/syncbn.py
+++ b/encoding/nn/syncbn.py
@@ -5,41 +5,34 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-import math
+"""Synchronized Cross-GPU Batch Normalization Module"""
 import threading
 import torch
-import torch.cuda.comm as comm
-from torch.autograd import Variable
-from torch.nn import Module, Sequential
-from torch.nn import functional as F
+from torch.nn import Module, Sequential, Conv1d, Conv2d, ConvTranspose2d, \
+    ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear
 from torch.nn.parameter import Parameter
-from torch.nn.modules.utils import _single, _pair, _triple
-from torch.nn.parallel.scatter_gather import scatter, scatter_kwargs, \
-    gather

-from ..functions import view_each, multi_each, sum_each, batchnormtrain, batchnormeval, sum_square 
-from ..parallel import my_data_parallel, Broadcast, AllReduce
+from ..functions import batchnormtrain, batchnormeval, sum_square
+from ..parallel import allreduce

-__all__ = ['BatchNorm1d', 'BatchNorm2d']
+# import standard layers for convenient use
+__all__ = ['BatchNorm1d', 'BatchNorm2d', 'Module', 'Sequential', 'Conv1d',
+           'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d',
+           'AvgPool2d', 'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']

 class BatchNorm1d(Module):
-    r"""Synchronized Batch Normalization 1d
+    r"""Cross-GPU Synchronized Batch normalization (SyncBN)

-    `Implementation ideas <./notes/syncbn.html>`_. Please use compatible :class:`encoding.parallel.SelfDataParallel` and :class:`encoding.nn`
-
-    Reference:
-
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
-
-    Applies Batch Normalization over a 2d or 3d input that is seen as a
-    mini-batch.
+    Standard BN [1]_ implementations only normalize the data within each device.
+    SyncBN normalizes the input within the whole mini-batch.
+    We follow the sync-once implementation described in the paper [2]_ .

    .. math::

-        y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
+        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
@@ -50,6 +43,9 @@ class BatchNorm1d(Module):

    During evaluation, this running mean/variance is used for normalization.

+    Because the BatchNorm is done over the `C` dimension, computing statistics
+    on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm
+
    Args:
        num_features: num_features from an expected input of size
            `batch_size x num_features [x width]`
@@ -57,16 +53,16 @@ class BatchNorm1d(Module):
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
-        affine: a boolean value that when set to true, gives the layer 
-            learnable affine parameters. Default: True
+        affine: a boolean value that when set to ``True``, gives the layer learnable
+            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples:
-        >>> m = encoding.nn.BatchNorm1d(100).cuda()
-        >>> input = autograd.Variable(torch.randn(20, 100)).cuda()
+        >>> # Use exactly the same as standard BatchNorm1d
+        >>> m = nn.BatchNorm1d(100)
        >>> output = m(input)
    """
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
@@ -85,6 +81,9 @@ class BatchNorm1d(Module):
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()
        self.writelock = threading.Lock()
+        nGPUs = torch.cuda.device_count()
+        self.xsum = SharedTensor(nGPUs)
+        self.xsquare = SharedTensor(nGPUs)

    def reset_parameters(self):
        self.running_mean.zero_()
@@ -100,142 +99,48 @@ class BatchNorm1d(Module):

    def _check_input_dim(self, input):
+        """Reject inputs that are not 3D.
+
+        ``forward`` appends a fourth axis with ``input.unsqueeze(3)``, so the
+        layer only accepts 3D input; the message must name the dimensionality
+        that the guard actually enforces.
+
+        Raises:
+            ValueError: if ``input.dim()`` is not 3.
+        """
        if input.dim() != 3:
            raise ValueError('expected 3D input (got {}D input)'
                             .format(input.dim()))

    def forward(self, input):
-        if isinstance(input, Variable):
-            self._check_input_dim(input)
-            if self.training:
-                xsum, xsquare = sum_square(input.unsqueeze(3))
-                N = input.size(0)*input.size(2)
-                mean = xsum / N
-                sumvar = xsquare - xsum * xsum / N
-                unbias_var = sumvar / (N - 1)
-                std = (sumvar / N + self.eps).sqrt()
-                # update running_mean and var
-                self.running_mean = (1-self.momentum) * self.running_mean \
-                    + self.momentum * mean.data
-                self.running_var = (1-self.momentum) * self.running_var + \
-                    self.momentum * unbias_var.data
-                # forward
-                output = batchnormtrain(
-                    input, self.weight, 
-                    self.bias, mean, 
-                    std)
-                return output
-            else:
-                var_mean = Variable(self.running_mean, requires_grad=False)
-                bias_var = Variable(self.running_var, requires_grad=False)
-                std = (bias_var + self.eps).sqrt()
-                return batchnormeval(
-                    input, self.weight, self.bias, var_mean, std)
-
-        elif isinstance(input, tuple) or isinstance(input, list):
-            self._check_input_dim(input[0])
-            # if evaluation, do it simple
-            if not self.training:
-                return my_data_parallel(self, input)
-            if len(input) == 1:
-                return self.forward(input[0])
-            # calculate mean and var using multithreading
-            all_sum, all_xsquare = {},{}
-            def _worker(i, x, lock):
-                try:
-                    with torch.cuda.device_of(x):
-                        xsum, xsquare = sum_square(x.unsqueeze(3))
-                    with lock:
-                        all_sum[i] = xsum 
-                        all_xsquare[i] = xsquare 
-                except Exception as e:
-                    with lock:
-                        all_sum[i] = e
-                        all_xsquare[i] = e
-            threads = [threading.Thread(target=_worker,
-                                        args=(i, x, self.writelock))
-                        for i, x in enumerate(input)]
-            for thread in threads:
-                thread.start()
-            for thread in threads:
-                thread.join()
-            # convert to list
-            def _to_list(x):
-                outputs = []
-                for i in range(len(x)):
-                    outputs.append(x[i])
-                return outputs
-            
-            all_sum = _to_list(all_sum)
-            all_xsquare = _to_list(all_xsquare)
-            xsums = AllReduce()(*all_sum)
-            xsquares = AllReduce()(*all_xsquare)
-
-            nGPUs = len(input)
-            N = nGPUs * input[0].size(0)*input[0].size(2)
-            assert(N>1)
-            xmean = xsums[0].data / N
-            unbias_var = (xsquares[0].data - N * xmean * xmean) / (N-1) 
+        self._check_input_dim(input)
+        if self.training:
+            # push the value
+            isum, isquare = sum_square(input.unsqueeze(3))
+            idxs = self.xsum.push(isum)
+            idxq = self.xsquare.push(isquare)
+            xsum = self.xsum[idxs]
+            xsquare = self.xsquare[idxq]
+            # calculate N
+            N = len(self.xsum)*input.size(0)*input.size(2)
+            mean = xsum / N
+            sumvar = xsquare - xsum * xsum / N
+            unbias_var = sumvar / (N - 1)
+            std = (sumvar / N + self.eps).sqrt()
            # update running_mean and var
            self.running_mean = (1-self.momentum) * self.running_mean \
-                + self.momentum * xmean
+                + self.momentum * mean.data
            self.running_var = (1-self.momentum) * self.running_var + \
-                self.momentum * unbias_var
-            # Broadcast the weight, bias, mean, std
-            device_ids = list(range(torch.cuda.device_count()))
-            weights = Broadcast(device_ids[:len(input)])(self.weight) 
-            biases = Broadcast(device_ids[:len(input)])(self.bias)
-            # parallel-apply
-            results = {}
-            def _worker_bn(i, x, xsum, xsquare, weight, bias, lock):
-                var_input = _get_a_var(x)
-                mean = xsum / N
-                std  = (xsquare / N - mean * mean + self.eps).sqrt()
-                try:
-                    with torch.cuda.device_of(var_input):
-                        result = batchnormtrain(
-                            x, weight, bias, mean, std)
-                    with lock: 
-                        results[i] = result
-                except Exception as e:
-                    with lock:
-                        results[i] = e
-            threads = [threading.Thread(target=_worker_bn,
-                                        args=(i, x, xsum, xsquare, weight, 
-                                              bias, self.writelock)
-                                       )
-                        for i,( x, xsum, xsquare, weight, bias) in 
-                        enumerate(zip(input, xsums, xsquares, 
-                                      weights, biases))]
-            for thread in threads:
-                thread.start()
-            for thread in threads:
-                thread.join()
-            outputs = []
-            for i in range(len(results)):
-                output = results[i]
-                if isinstance(output, Exception):
-                    raise output
-                outputs.append(output)
-            return outputs
+                self.momentum * unbias_var.data
+            # forward
+            return batchnormtrain(input, self.weight,
+                                  self.bias, mean, std)
        else:
-            raise RuntimeError('unknown input type')
+            std = (self.running_var + self.eps).sqrt()
+            return batchnormeval(input, self.weight, self.bias,
+                                 self.running_mean, std)


 class BatchNorm2d(Module):
-    r"""Synchronized Batch Normalization 2d
-
-    `Implementation ideas <./notes/syncbn.html>`_. Please use compatible :class:`encoding.parallel.SelfDataParallel` and :class:`encoding.nn`. 
-
-    Reference:
-
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
+    r"""Cross-GPU Synchronized Batch normalization (SyncBN)

-    Applies Batch Normalization over a 4d input that is seen as a mini-batch
-    of 3d inputs
+    Standard BN [1]_ implementations only normalize the data within each device.
+    SyncBN normalizes the input within the whole mini-batch.
+    We follow the sync-once implementation described in the paper [2]_ .

    .. math::

-        y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
+        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
@@ -246,6 +151,9 @@ class BatchNorm2d(Module):

    During evaluation, this running mean/variance is used for normalization.

+    Because the BatchNorm is done over the `C` dimension, computing statistics
+    on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm
+
    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x height x width
@@ -253,16 +161,20 @@ class BatchNorm2d(Module):
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
-        affine: a boolean value that when set to true, gives the layer learnable
-            affine parameters. Default: True
+        affine: a boolean value that when set to ``True``, gives the layer learnable
+            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

+    Reference:
+        .. [1] Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating deep network training by reducing internal covariate shift." *ICML 2015*
+        .. [2] Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, and Amit Agrawal. "Context Encoding for Semantic Segmentation." *CVPR 2018*
+
    Examples:
-        >>> m = encoding.nn.BatchNorm2d(100).cuda()
-        >>> input = autograd.Variable(torch.randn(20, 100, 35, 45)).cuda()
+        >>> # Use exactly the same as standard BatchNorm2d
+        >>> m = nn.BatchNorm2d(100)
        >>> output = m(input)
    """
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
@@ -281,6 +193,8 @@ class BatchNorm2d(Module):
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()
        self.writelock = threading.Lock()
+        nGPUs = torch.cuda.device_count()
+        self.xsum, self.xsquare = SharedTensor(nGPUs), SharedTensor(nGPUs)

    def reset_parameters(self):
        self.running_mean.zero_()
@@ -300,141 +214,91 @@ class BatchNorm2d(Module):
                             .format(input.dim()))

    def forward(self, input):
-        if isinstance(input, Variable):
-            self._check_input_dim(input)
-            if self.training:
-                xsum, xsquare = sum_square(input)
-                N = input.size(0)*input.size(2)*input.size(3)
-                mean = xsum / N
-                sumvar = xsquare - xsum * xsum / N
-                unbias_var = sumvar / (N - 1)
-                std = (sumvar / N + self.eps).sqrt()
-                # update running_mean and var
-                self.running_mean = (1-self.momentum) * self.running_mean \
-                    + self.momentum * mean.data
-                self.running_var = (1-self.momentum) * self.running_var + \
-                    self.momentum * unbias_var.data
-                # forward
-                B, C, H, W = input.size()
-                output = batchnormtrain(
-                    input.view(B,C,-1).contiguous(), self.weight, 
-                    self.bias, mean, 
-                    std)
-                return output.view(B, C, H, W)
-            else:
-                var_mean = Variable(self.running_mean, requires_grad=False)
-                bias_var = Variable(self.running_var, requires_grad=False)
-                std = (bias_var + self.eps).sqrt()
-                B, C, H, W = input.size()
-                return batchnormeval(
-                    input.view(B,C,-1).contiguous(), 
-                    self.weight, self.bias, var_mean, 
-                    std).view(B, C, H, W)
-
-        elif isinstance(input, tuple) or isinstance(input, list):
-            self._check_input_dim(input[0])
-            # if evaluation, do it simple
-            if not self.training:
-                return my_data_parallel(self, input)
-            if len(input) == 1:
-                return self.forward(input[0])
-            # calculate mean and var using multithreading
-            all_sum, all_xsquare = {},{}
-            def _worker(i, x, lock):
-                try:
-                    with torch.cuda.device_of(x):
-                        xsum, xsquare = sum_square(x)
-                    with lock:
-                        all_sum[i] = xsum 
-                        all_xsquare[i] = xsquare 
-                except Exception as e:
-                    with lock:
-                        all_sum[i] = e
-                        all_xsquare[i] = e
-            threads = [threading.Thread(target=_worker,
-                                        args=(i, x, self.writelock))
-                        for i, x in enumerate(input)]
-            for thread in threads:
-                thread.start()
-            for thread in threads:
-                thread.join()
-            # convert to list
-            def _to_list(x):
-                outputs = []
-                for i in range(len(x)):
-                    outputs.append(x[i])
-                return outputs
-            
-            all_sum = _to_list(all_sum)
-            all_xsquare = _to_list(all_xsquare)
-            xsums = AllReduce()(*all_sum)
-            xsquares = AllReduce()(*all_xsquare)
-
-            nGPUs = len(input)
-            N = nGPUs * input[0].size(0)*input[0].size(2)*input[0].size(3)
-            assert(N>1)
-            xmean = xsums[0].data / N
-            unbias_var = (xsquares[0].data - N * xmean * xmean) / (N-1) 
+        self._check_input_dim(input)
+        if self.training:
+            # push the value
+            isum, isquare = sum_square(input)
+            idxs = self.xsum.push(isum)
+            idxq = self.xsquare.push(isquare)
+            xsum = self.xsum[idxs]
+            xsquare = self.xsquare[idxq]
+            # calculate N
+            N = len(self.xsum)*input.size(0)*input.size(2)*input.size(3)
+            mean = xsum / N
+            sumvar = xsquare - xsum * xsum / N
+            unbias_var = sumvar / (N - 1)
+            std = (sumvar / N + self.eps).sqrt()
            # update running_mean and var
            self.running_mean = (1-self.momentum) * self.running_mean \
-                + self.momentum * xmean
+                + self.momentum * mean.data
            self.running_var = (1-self.momentum) * self.running_var + \
-                self.momentum * unbias_var
-            # Broadcast the weight, bias, mean, std
-            device_ids = list(range(torch.cuda.device_count()))
-            weights = Broadcast(device_ids[:len(input)])(self.weight) 
-            biases = Broadcast(device_ids[:len(input)])(self.bias)
-            # parallel-apply
-            results = {}
-            def _worker_bn(i, x, xsum, xsquare, weight, bias, lock):
-                var_input = _get_a_var(x)
-                mean = xsum / N
-                std  = (xsquare / N - mean * mean + self.eps).sqrt()
-                try:
-                    with torch.cuda.device_of(var_input):
-                        B, C, H, W = x.size()
-                        result = batchnormtrain(
-                            x.view(B,C, -1), weight, bias, mean, 
-                            std).view(B, C, H, W)
-                    with lock: 
-                        results[i] = result
-                except Exception as e:
-                    with lock:
-                        results[i] = e
-            threads = [threading.Thread(target=_worker_bn,
-                                        args=(i, x, xsum, xsquare, weight, 
-                                              bias, self.writelock)
-                                       )
-                        for i,( x, xsum, xsquare, weight, bias) in 
-                        enumerate(zip(input, xsums, xsquares, 
-                                      weights, biases))]
-            for thread in threads:
-                thread.start()
-            for thread in threads:
-                thread.join()
-            outputs = []
-            for i in range(len(results)):
-                output = results[i]
-                if isinstance(output, Exception):
-                    raise output
-                outputs.append(output)
-            return outputs
+                self.momentum * unbias_var.data
+            # forward
+            B, C, H, W = input.size()
+            output = batchnormtrain(
+                input.view(B, C, -1).contiguous(), self.weight,
+                self.bias, mean,
+                std)
+            return output.view(B, C, H, W)
        else:
-            raise RuntimeError('unknown input type')
-
-
-def _get_a_var(obj):
-    if isinstance(obj, Variable):
-        return obj
-
-    if isinstance(obj, list) or isinstance(obj, tuple):
-        results = map(_get_a_var, obj)
-        for result in results:
-            if isinstance(result, Variable):
-                return result
-    if isinstance(obj, dict):
-        results = map(_get_a_var, obj.items())
-        for result in results:
-            if isinstance(result, Variable):
-                return result
-    return None
+            std = (self.running_var + self.eps).sqrt()
+            B, C, H, W = input.size()
+            return batchnormeval(input.view(B, C, -1).contiguous(), self.weight, self.bias,
+                                 self.running_mean, std).view(B, C, H, W)
+
+
+class SharedTensor(object):
+    """Rendezvous buffer for ``nGPUs`` concurrent callers.
+
+    Each caller hands in one tensor with :meth:`push` and later indexes the
+    object with the slot number it got back. Indexing triggers one
+    ``allreduce`` over all pushed tensors per round and returns that slot's
+    entry of the result. Both steps block until ``nGPUs`` callers have arrived.
+
+    NOTE(review): the counters assume exactly ``nGPUs`` participants per round;
+    with ``nGPUs == 0`` or fewer callers the wait loops below never exit --
+    confirm that callers always run one thread per counted GPU.
+    """
+    def __init__(self, nGPUs):
+        """Create the lock/condition pair and start an empty round.
+
+        Args:
+            nGPUs: number of push() calls (and index accesses) per round.
+        """
+        self.mutex = threading.Lock()
+        # the condition wraps the same lock, so ``with self.all_tasks_done``
+        # and ``with self.mutex`` exclude each other
+        self.all_tasks_done = threading.Condition(self.mutex)
+        self.nGPUs = nGPUs
+        self._clear()
+
+    def _clear(self):
+        """Empty the input list and reset both countdown counters.
+
+        ``self.outlist`` is not touched here; it stays until the next
+        ``_reduce`` overwrites it.
+        """
+        self.list = []
+        self.push_tasks = self.nGPUs
+        self.reduce_tasks = self.nGPUs
+
+    def push(self, t):
+        """Store ``t`` for the current round and wait for the other callers.
+
+        Args:
+            t: the tensor to contribute.
+
+        Returns:
+            The position of ``t`` in the internal list, to be used as the
+            index when reading the reduced value back.
+        """
+        with self.mutex:
+            # a zero counter means the previous round is complete: start over
+            if self.push_tasks == 0:
+                self._clear()
+            self.list.append(t)
+            idx = len(self.list) - 1
+            self.push_tasks -= 1
+
+        with self.all_tasks_done:
+            # the last arrival wakes everybody; earlier arrivals sleep until
+            # the counter reaches zero
+            if self.push_tasks == 0:
+                self.all_tasks_done.notify_all()
+            while self.push_tasks:
+                self.all_tasks_done.wait()
+        return idx
+
+    def _reduce(self):
+        """Run ``allreduce`` once per round, then wait for every caller."""
+        with self.mutex:
+            # only the first caller of a round sees the untouched counter
+            if self.reduce_tasks == self.nGPUs:
+                assert(len(self.list) == self.nGPUs)
+                # presumably yields one reduced tensor per pushed tensor --
+                # allreduce comes from ..parallel, TODO confirm
+                self.outlist = allreduce(*self.list)
+                self.reduce_tasks -= 1
+            else:
+                self.reduce_tasks -= 1
+
+        with self.all_tasks_done:
+            # same barrier shape as in push(), on the reduce counter
+            if self.reduce_tasks == 0:
+                self.all_tasks_done.notify_all()
+            while self.reduce_tasks:
+                self.all_tasks_done.wait()
+
+    def __getitem__(self, idx):
+        """Return entry ``idx`` of the reduced list.
+
+        Every access calls ``_reduce`` and therefore counts as one arrival at
+        the reduce barrier (and blocks until the barrier opens).
+        """
+        self._reduce()
+        return self.outlist[idx]
+
+    def __len__(self):
+        """Number of tensors pushed so far in the current round."""
+        return len(self.list)
+
+    def __repr__(self):
+        return ('SharedTensor')
--- a/encoding/parallel.py
+++ b/encoding/parallel.py
@@ -5,113 +5,42 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

+"""Encoding Data Parallel"""
 import threading
 import torch
-import torch.cuda.nccl as nccl
-import torch.cuda.comm as comm
-from torch.autograd import Variable, Function
+from torch.autograd import Variable
 from torch.nn.modules import Module
-from torch.nn.parallel.scatter_gather import scatter, scatter_kwargs, \
-    gather
+from torch.nn.parallel.scatter_gather import scatter_kwargs
 from torch.nn.parallel.replicate import replicate
 from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast

-__all__ = ['Reduce', 'AllReduce', 'Broadcast', 'ModelDataParallel', 
-    'CriterionDataParallel', 'SelfDataParallel']
-
-def nccl_all_reduce(inputs):
-    # TODO, figure out why nccl all_reduce doesn't work for gradcheck
-    input_size = inputs[0].size()
-    #if nccl.is_available(inputs):
-    for i, inp in enumerate(inputs):
-        assert inp.is_cuda, \
-            "reduce_add expects all inputs to be on GPUs"
-        if inp.size() != input_size:
-            got = 'x'.join(str(x) for x in inp.size())
-            expected = 'x'.join(str(x) for x in input_size)
-            raise ValueError("input {} has invalid size: got {}, \
-                but expected {}".format(i, got, expected))
-    nccl.all_reduce(inputs)
-    return inputs
-
-def comm_all_reduce(inputs):
-    # comm backend
-    result = comm.reduce_add(inputs)
-    results = []
-    for i in range(len(inputs)):
-        results.append(result.clone().cuda(i))
-    return results
-
-class Reduce(Function):
-    def forward(ctx, *inputs):
-        ctx.save_for_backward(*inputs)
-        if len(inputs) == 1:
-            return inputs[0]
-        return comm.reduce_add(inputs)
-
-    def backward(ctx, gradOutput):
-        inputs = tuple(ctx.saved_tensors)
-        if len(inputs) == 1:
-            return gradOutput
-        gradInputs = []
-        for i in range(len(inputs)):
-            with torch.cuda.device_of(inputs[i]):
-                gradInputs.append(gradOutput.cuda())
-        return tuple(gradInputs)
-
-class AllReduce(Function):
-    """Cross GPU all reduce autograd operation for calculate mean and
-    variance in SyncBN.
-    """
-    def forward(ctx, *inputs):
-        outputs = comm_all_reduce(list(inputs))
-        return tuple(outputs)
-
-    def backward(ctx, *gradOutputs):
-        gradInputs = comm_all_reduce(list(gradOutputs))
-        return tuple(gradInputs)
+__all__ = ['allreduce', 'ModelDataParallel', 'CriterionDataParallel']


-class Broadcast(Function):
-    """Multi-GPU broadcast autograd function
+def allreduce(*inputs):
+    """Cross GPU all reduce autograd operation for calculate mean and
+    variance in SyncBN.
    """
-    def __init__(self, target_gpus):
-        super(Broadcast, self).__init__()
-        self.target_gpus = target_gpus
-
-    def forward(self, *inputs):
-        if not all(input.is_cuda for input in inputs):
-            raise TypeError('Broadcast function not implemented for CPU tensors')
-        if len(inputs) == 0:
-            return tuple()
-        self.num_inputs = len(inputs)
-        self.input_device = inputs[0].get_device()
-        outputs = comm.broadcast_coalesced(inputs, self.target_gpus)
-        return tuple([t for tensors in outputs for t in tensors])
-
-    def backward(self, *grad_outputs):
-        grad_outputs = [grad_outputs[i:i + self.num_inputs]
-                        for i in range(0, len(grad_outputs), self.num_inputs)]
-        return comm.reduce_add_coalesced(grad_outputs, self.input_device)
+    target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
+    result = ReduceAddCoalesced.apply(target_gpus[0], 1, *inputs)
+    outputs = Broadcast.apply(target_gpus, *result)
+    assert len(outputs) == len(inputs)
+    return outputs


 class ModelDataParallel(Module):
    """Implements data parallelism at the module level.

-    Reference:
-
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
-
    This container parallelizes the application of the given module by
-    splitting the input across the specified devices by chunking in the 
-    batch dimension. 
+    splitting the input across the specified devices by chunking in the
+    batch dimension.
    In the forward pass, the module is replicated on each device,
-    and each replica handles a portion of the input. During the backwards
-    pass, gradients from each replica are summed into the original module.
-    Note that the outputs are not gathered, please use compatible 
+    and each replica handles a portion of the input. During the backwards pass, gradients from each replica are summed into the original module.
+    Note that the outputs are not gathered, please use compatible
    :class:`encoding.parallel.CriterionDataParallel`.

    The batch size should be larger than the number of GPUs used. It should
@@ -122,10 +51,15 @@ class ModelDataParallel(Module):
        module: module to be parallelized
        device_ids: CUDA devices (default: all devices)

+    Reference:
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
+        Amit Agrawal. “Context Encoding for Semantic Segmentation.”
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
+
    Example::

        >>> net = encoding.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
-        >>> output = net(input_var)
+        >>> y = net(x)
    """
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(ModelDataParallel, self).__init__()
@@ -140,13 +74,6 @@ class ModelDataParallel(Module):
        self.master_mean, self.master_var = {}, {}
        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])
-        """
-        # TODO FIXME temporal solution for BN
-        for m in self.module.modules():
-            classname = m.__class__.__name__ 
-            if classname.find('BatchNorm2d') != -1:
-                m.momentum = 0.9996
-        """

    def forward(self, *inputs, **kwargs):
        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
@@ -155,7 +82,7 @@ class ModelDataParallel(Module):
        replicas = self.replicate(self.module, \
            self.device_ids[:len(inputs)])
        outputs = self.parallel_apply(replicas, inputs, kwargs)
-        return outputs 
+        return outputs

    def replicate(self, module, device_ids):
        return replicate(module, device_ids)
@@ -166,18 +93,26 @@ class ModelDataParallel(Module):
    def parallel_apply(self, replicas, inputs, kwargs):
        return parallel_apply(replicas, inputs, kwargs)

-    
+
 class CriterionDataParallel(Module):
    """
-    Calculate loss in multiple-GPUs, which balance the memory usage for 
+    Calculate loss in multiple-GPUs, which balance the memory usage for
    Semantic Segmentation.

-    Reference:
- 
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
-
    The targets are splitted across the specified devices by chunking in
    the batch dimension. Please use together with :class:`encoding.parallel.ModelDataParallel`.
+
+    Reference:
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
+        Amit Agrawal. “Context Encoding for Semantic Segmentation.”
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
+
+    Example::
+
+        >>> net = encoding.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
+        >>> criterion = encoding.nn.CriterionDataParallel(criterion, device_ids=[0, 1, 2])
+        >>> y = net(x)
+        >>> loss = criterion(y, target)
    """
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(CriterionDataParallel, self).__init__()
@@ -200,7 +135,7 @@ class CriterionDataParallel(Module):
            return self.module(inputs, *targets[0], **kwargs[0])
        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
        outputs = self.parallel_apply(replicas, inputs, targets, kwargs)
-        return self.gather(outputs, self.output_device)
+        return ReduceAddCoalesced.apply(self.output_device, 1, *outputs) / len(outputs)

    def replicate(self, module, device_ids):
        return replicate(module, device_ids)
@@ -209,64 +144,10 @@ class CriterionDataParallel(Module):
        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)

    def parallel_apply(self, replicas, inputs, targets, kwargs):
-        return criterion_parallel_apply(replicas, inputs, targets, kwargs)
-
-    def gather(self, outputs, output_device):
-        return gather(outputs, output_device, dim=self.dim).mean()
-    
+        return _criterion_parallel_apply(replicas, inputs, targets, kwargs)

-class SelfDataParallel(Module):
-    """SelfDataParallel, please make sure you understand it before using.

-    Reference:
-
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
-
-    Each module in the network should be in self-parallel mode, 
-    which allows list of inputs from multiple GPUs.
-    Please see :class:`encoding.nn` for detail, use with cautious
-    """
-    def __init__(self, module, device_ids=None, output_device=None, dim=0):
-        super(SelfDataParallel, self).__init__()
-        if device_ids is None:
-            device_ids = list(range(torch.cuda.device_count()))
-        if output_device is None:
-            output_device = device_ids[0]
-        self.dim = dim
-        self.module = module
-        self.device_ids = device_ids
-        self.output_device = output_device
-        self.master_mean, self.master_var = {}, {}
-        if len(self.device_ids) == 1:
-            self.module.cuda(device_ids[0])
-
-    def forward(self, *inputs, **kwargs):
-        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
-        if self.training:
-            # self parallel mode
-            outputs = self.module(inputs)
-            return outputs
-        else:
-            # TODO check faster?
-            if len(self.device_ids) == 1:
-                return self.module(*inputs[0], **kwargs[0])
-            replicas = self.replicate(self.module, \
-                self.device_ids[:len(inputs)])
-            outputs = self.parallel_apply(replicas, inputs, kwargs)
-            return outputs 
-            
-    def replicate(self, module, device_ids):
-        return replicate(module, device_ids)
-
-    def parallel_apply(self, replicas, inputs, kwargs):
-        return parallel_apply(replicas, inputs, kwargs)
-
-    def scatter(self, inputs, kwargs, device_ids):
-        outputs = scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
-        return outputs
-
-
-def criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
+def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
    assert len(modules) == len(inputs)
    assert len(targets) == len(inputs)
    if kwargs_tup:
@@ -281,13 +162,8 @@ def criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
    results = {}

    def _worker(i, module, input, target, kwargs, results, lock):
-        var_input = input
-        while not isinstance(var_input, Variable):
-            var_input = var_input[0]
-        var_target = target
-        while not isinstance(var_target, Variable):
-            var_target = var_target[0]
        try:
+            var_input = _get_a_var(input)
            with torch.cuda.device_of(var_input):
                output = module(input, *target, **kwargs)
            with lock:
@@ -297,9 +173,8 @@ def criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
                results[i] = e

    threads = [threading.Thread(target=_worker,
-                                args=(i, module, input, target, 
-                                      kwargs, results, lock),
-                                )
+                                args=(i, module, input, target,
+                                      kwargs, results, lock),)
               for i, (module, input, target, kwargs) in
               enumerate(zip(modules, inputs, targets, kwargs_tup))]

@@ -316,77 +191,18 @@ def criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
    return outputs


-def get_a_var(obj):
+def _get_a_var(obj):
    if isinstance(obj, Variable):
        return obj

    if isinstance(obj, list) or isinstance(obj, tuple):
-        results = map(get_a_var, obj)
+        results = map(_get_a_var, obj)
        for result in results:
            if isinstance(result, Variable):
                return result
    if isinstance(obj, dict):
-        results = map(get_a_var, obj.items())
+        results = map(_get_a_var, obj.items())
        for result in results:
            if isinstance(result, Variable):
                return result
    return None
-
-
-def my_parallel_apply(modules, inputs, kwargs_tup=None):
-    assert len(modules) == len(inputs)
-    if kwargs_tup:
-        assert len(modules) == len(kwargs_tup)
-    else:
-        kwargs_tup = ({},) * len(modules)
-    # Fast track
-    if len(modules) == 1:
-        return (modules[0](*inputs[0], **kwargs_tup[0]), )
-
-    lock = threading.Lock()
-    results = {}
-
-    def _worker(i, module, input, kwargs, results, lock):
-        var_input = get_a_var(input)
-        try:
-            with torch.cuda.device_of(var_input):
-                output = module(input, **kwargs)
-            with lock:
-                results[i] = output
-        except Exception as e:
-            with lock:
-                results[i] = e
-
-    threads = [threading.Thread(target=_worker,
-                                args=(i, module, input, kwargs, results, lock),
-                                )
-               for i, (module, input, kwargs) in
-               enumerate(zip(modules, inputs, kwargs_tup))]
-
-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
-    outputs = []
-    for i in range(len(inputs)):
-        output = results[i]
-        if isinstance(output, Exception):
-            raise output
-        outputs.append(output)
-    return outputs
-
-
-def my_data_parallel(module, inputs, device_ids=None, \
-    dim=0, module_kwargs=None):
-    if device_ids is None:
-        device_ids = list(range(torch.cuda.device_count()))
-
-    if len(inputs) == 1:
-        return module(inputs[0])
-
-    #print('my data parallel, len(inputs)', len(inputs))
-    replicas = replicate(module, device_ids[:len(inputs)])
-    outputs = my_parallel_apply(replicas, inputs, module_kwargs)
-    return outputs 
-
-
--- a/encoding/utils.py
+++ b/encoding/utils.py
@@ -5,22 +5,20 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-import torch
+"""Encoding Util Tools"""
 import shutil
 import os
-import sys
-import time
 import math
-import tqdm
+import torch

-__all__ = ['get_optimizer', 'LR_Scheduler', 'save_checkpoint', 'progress_bar']
+__all__ = ['get_optimizer', 'LR_Scheduler', 'save_checkpoint']

 def get_optimizer(args, model, diff_LR=True):
    """
-    Returns an optimizer for given model, 
+    Returns an optimizer for given model,

    Args:
        args: :attr:`args.lr`, :attr:`args.momentum`, :attr:`args.weight_decay`
@@ -29,17 +27,17 @@ def get_optimizer(args, model, diff_LR=True):
    if diff_LR and model.pretrained is not None:
        print('Using different learning rate for pre-trained features')
        optimizer = torch.optim.SGD([
-                        {'params': model.pretrained.parameters()}, 
-                        {'params': model.head.parameters(), 
-                          'lr': args.lr*10},
-                    ], 
-                    lr=args.lr,
-                    momentum=args.momentum, 
-                    weight_decay=args.weight_decay)
+            {'params': model.pretrained.parameters()},
+            {'params': model.head.parameters(),
+             'lr': args.lr*10},
+            ],
+                                    lr=args.lr,
+                                    momentum=args.momentum,
+                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
-                                    momentum=args.momentum, 
-                                    weight_decay=args.weight_decay) 
+                                    momentum=args.momentum,
+                                    weight_decay=args.weight_decay)
    return optimizer


@@ -53,12 +51,14 @@ class LR_Scheduler(object):
    Poly mode: ``lr = baselr * (1 - iter/maxiter) ^ 0.9``

    Args:
-        args:  :attr:`args.lr_scheduler` lr scheduler mode (`cos`, `poly`), :attr:`args.lr` base learning rate, :attr:`args.epochs` number of epochs, :attr:`args.lr_step`
+        args:  :attr:`args.lr_scheduler` lr scheduler mode (`cos`, `poly`),
+          :attr:`args.lr` base learning rate, :attr:`args.epochs` number of epochs,
+          :attr:`args.lr_step`

        niters: number of iterations per epoch
    """
    def __init__(self, args, niters=0):
-        self.mode = args.lr_scheduler 
+        self.mode = args.lr_scheduler
        print('Using {} LR Scheduler!'.format(self.mode))
        self.lr = args.lr
        if self.mode == 'step':
@@ -81,8 +81,7 @@ class LR_Scheduler(object):
            raise RuntimeError('Unknown LR scheduler!')
        if epoch > self.epoch:
            print('\n=>Epoches %i, learning rate = %.4f, \
-                previous best = %.4f' % (
-                epoch, lr, best_pred))
+                previous best = %.4f' % (epoch, lr, best_pred))
            self.epoch = epoch
        self._adjust_learning_rate(optimizer, lr)

@@ -92,7 +91,7 @@ class LR_Scheduler(object):
        else:
            # enlarge the lr at the head
            optimizer.param_groups[0]['lr'] = lr
-            for i in range(1,len(optimizer.param_groups)):
+            for i in range(1, len(optimizer.param_groups)):
                optimizer.param_groups[i]['lr'] = lr * 10


@@ -106,88 +105,3 @@ def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, directory + 'model_best.pth.tar')
-
-
-# refer to https://github.com/kuangliu/pytorch-cifar/blob/master/utils.py
-_, term_width = os.popen('stty size', 'r').read().split()
-term_width = int(term_width)-1
-TOTAL_BAR_LENGTH = 36.
-last_time = time.time()
-begin_time = last_time
-
-def progress_bar(current, total, msg=None):
-    """Progress Bar for display
-    """
-    global last_time, begin_time
-    if current == 0:
-        begin_time = time.time()    # Reset for new bar.
-
-    cur_len = int(TOTAL_BAR_LENGTH*current/total)
-    rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
-
-    sys.stdout.write(' [')
-    for i in range(cur_len):
-        sys.stdout.write('=')
-    sys.stdout.write('>')
-    for i in range(rest_len):
-        sys.stdout.write('.')
-    sys.stdout.write(']')
-
-    cur_time = time.time()
-    step_time = cur_time - last_time
-    last_time = cur_time
-    tot_time = cur_time - begin_time
-
-    L = []
-    L.append('    Step: %s' % _format_time(step_time))
-    L.append(' | Tot: %s' % _format_time(tot_time))
-    if msg:
-        L.append(' | ' + msg)
-
-    msg = ''.join(L)
-    sys.stdout.write(msg)
-    for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
-        sys.stdout.write(' ')
-
-    # Go back to the center of the bar.
-    for i in range(term_width-int(TOTAL_BAR_LENGTH/2)):
-        sys.stdout.write('\b')
-    sys.stdout.write(' %d/%d ' % (current+1, total))
-
-    if current < total-1:
-        sys.stdout.write('\r')
-    else:
-        sys.stdout.write('\n')
-    sys.stdout.flush()
-
-def _format_time(seconds):
-    days = int(seconds / 3600/24)
-    seconds = seconds - days*3600*24
-    hours = int(seconds / 3600)
-    seconds = seconds - hours*3600
-    minutes = int(seconds / 60)
-    seconds = seconds - minutes*60
-    secondsf = int(seconds)
-    seconds = seconds - secondsf
-    millis = int(seconds*1000)
-
-    f = ''
-    i = 1
-    if days > 0:
-        f += str(days) + 'D'
-        i += 1
-    if hours > 0 and i <= 2:
-        f += str(hours) + 'h'
-        i += 1
-    if minutes > 0 and i <= 2:
-        f += str(minutes) + 'm'
-        i += 1
-    if secondsf > 0 and i <= 2:
-        f += str(secondsf) + 's'
-        i += 1
-    if millis > 0 and i <= 2:
-        f += str(millis) + 'ms'
-        i += 1
-    if f == '':
-        f = '0ms'
-    return f
--- a/requirements.txt
+++ b/requirements.txt
 tqdm
-dominate
+nose
--- a/setup.py
+++ b/setup.py
@@ -23,16 +23,17 @@ class install(setuptools.command.install.install):
    def run(self):
        self.create_version_file()
        setuptools.command.install.install.run(self)
-        subprocess.check_call("python test/test.py".split())
+        subprocess.check_call("python tests/unit_test.py".split())
    @staticmethod
    def create_version_file():
        global version, cwd
        print('-- Building version ' + version)
        version_path = os.path.join(cwd, 'encoding', 'version.py')
        with open(version_path, 'w') as f:
+            f.write('"""This is encoding version file."""\n')
            f.write("__version__ = '{}'\n".format(version))

-version = '0.2.0'
+version = '0.3.0'
 try:
    sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], 
        cwd=cwd).decode('ascii').strip()

--- a/tests/lint.py
+++ b/tests/lint.py
+#!/usr/bin/env python
+# pylint: disable=protected-access, unused-variable, locally-disabled, len-as-condition
+"""Lint helper to generate lint summary of source.
+
+Copyright by Contributors
+"""
+from __future__ import print_function
+import argparse
+import codecs
+import sys
+import re
+import os
+import cpplint
+from cpplint import _cpplint_state
+from pylint import epylint
+
+CXX_SUFFIX = set(['cc', 'c', 'cpp', 'h', 'cu', 'hpp'])
+PYTHON_SUFFIX = set(['py'])
+
+def filepath_enumerate(paths):
+    """Enumerate the file paths of all subfiles of the list of paths.
+
+    Args:
+        paths: iterable of file or directory paths.
+
+    Returns:
+        List of normalized file paths: every entry that is itself a file,
+        plus every file found by walking each remaining entry as a directory.
+    """
+    out = []
+    for path in paths:
+        if os.path.isfile(path):
+            # Normalize explicit files as well: main() compares normalized
+            # paths against this list, so './a.py' must be stored as 'a.py'.
+            out.append(os.path.normpath(path))
+        else:
+            for root, _, files in os.walk(path):
+                for name in files:
+                    out.append(os.path.normpath(os.path.join(root, name)))
+    return out
+
+class LintHelper(object):
+    """Class to help running the lint and record a summary.
+
+    Results are kept in three maps keyed by file path (C++ headers, C++
+    sources, Python files). Each value maps an error category to the number
+    of times it was reported for that file.
+    """
+
+    @staticmethod
+    def _print_summary_map(strm, result_map, ftype):
+        """Print summary of certain result map.
+
+        Args:
+            strm: writable text stream that receives the summary.
+            result_map: dict mapping file name to its ``{category: count}`` dict.
+            ftype: label of the file type, used in the header line.
+
+        Returns:
+            Number of files in ``result_map`` with at least one recorded error.
+        """
+        if len(result_map) == 0:
+            return 0
+        npass = len([x for k, x in result_map.items() if len(x) == 0])
+        strm.write('=====%d/%d %s files passed check=====\n' % (npass, len(result_map), ftype))
+        for fname, emap in result_map.items():
+            if len(emap) == 0:
+                continue
+            strm.write('%s: %d Errors of %d Categories map=%s\n' % (
+                fname, sum(emap.values()), len(emap), str(emap)))
+        return len(result_map) - npass
+
+    def __init__(self):
+        """Create empty result maps and configure pylint options and cpplint."""
+        self.project_name = None
+        self.cpp_header_map = {}
+        self.cpp_src_map = {}
+        self.python_map = {}
+        pylint_disable = ['superfluous-parens',
+                          'too-many-instance-attributes',
+                          'too-few-public-methods']
+        # setup pylint
+        self.pylint_opts = ['--extension-pkg-whitelist=numpy',
+                            '--disable=' + ','.join(pylint_disable)]
+
+        # Only output lines whose category is in this set are counted.
+        self.pylint_cats = set(['error', 'warning', 'convention', 'refactor'])
+        # setup cpp lint
+        cpplint_args = ['.', '--extensions=' + (','.join(CXX_SUFFIX))]
+        _ = cpplint.ParseArguments(cpplint_args)
+        # NOTE(review): '-build/include,' carries a trailing comma inside the
+        # string; presumably cpplint ignores the resulting empty filter -- confirm.
+        cpplint._SetFilters(','.join(['-build/c++11',
+                                      '-build/namespaces',
+                                      '-build/include,',
+                                      '+build/include_what_you_use',
+                                      '+build/include_order']))
+        cpplint._SetCountingStyle('toplevel')
+        cpplint._line_length = 100
+
+    def process_cpp(self, path, suffix):
+        """Process a cpp file.
+
+        Args:
+            path: path of the file to lint.
+            suffix: file extension; ``'h'`` results go to the header map,
+                everything else to the source map.
+        """
+        _cpplint_state.ResetErrorCounts()
+        cpplint.ProcessFile(str(path), _cpplint_state.verbose_level)
+        _cpplint_state.PrintErrorCounts()
+        # Copy, because the shared cpplint state is reset for the next file.
+        errors = _cpplint_state.errors_by_category.copy()
+
+        if suffix == 'h':
+            self.cpp_header_map[str(path)] = errors
+        else:
+            self.cpp_src_map[str(path)] = errors
+
+    def process_python(self, path):
+        """Process a python file.
+
+        Runs pylint on ``path``, echoes its output to stderr and records how
+        many messages of each known category were reported.
+        """
+        (pylint_stdout, pylint_stderr) = epylint.py_run(
+            ' '.join([str(path)] + self.pylint_opts), return_std=True)
+        emap = {}
+        err = pylint_stderr.read()
+        if len(err):
+            print(err)
+        for line in pylint_stdout:
+            sys.stderr.write(line)
+            # The category is the text after the last ':' and before the first '('.
+            key = line.split(':')[-1].split('(')[0].strip()
+            if key not in self.pylint_cats:
+                continue
+            emap[key] = emap.get(key, 0) + 1
+        self.python_map[str(path)] = emap
+
+    def print_summary(self, strm):
+        """Print summary of lint.
+
+        Args:
+            strm: writable text stream that receives the summary.
+
+        Returns:
+            Total number of files, across all three maps, that have errors.
+        """
+        nerr = 0
+        nerr += LintHelper._print_summary_map(strm, self.cpp_header_map, 'cpp-header')
+        nerr += LintHelper._print_summary_map(strm, self.cpp_src_map, 'cpp-source')
+        nerr += LintHelper._print_summary_map(strm, self.python_map, 'python')
+        if nerr == 0:
+            strm.write('All passed!\n')
+        else:
+            strm.write('%d files failed lint\n' % nerr)
+        return nerr
+
+# singleton helper for lint check
+_HELPER = LintHelper()
+
+def get_header_guard_dmlc(filename):
+    """Build the header-guard macro name used by DMLC projects.
+
+    Headers under ``src/`` get the project name followed by their path below
+    ``src``; other headers use their path with a leading include directory
+    removed.
+
+    Examples: with project-name = dmlc
+        include/dmlc/timer.h -> DMLC_TIMER_H_
+        src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
+    """
+    rel_path = cpplint.FileInfo(filename).RepositoryName()
+    strip_dirs = ['include', 'api', 'wrapper', 'contrib']
+    if os.name == 'nt':
+        strip_dirs.append("mshadow")
+
+    src_at = rel_path.find('src/')
+    if src_at != -1 and _HELPER.project_name is not None:
+        # Keep the '/' that follows 'src' so it separates project and path.
+        rel_path = _HELPER.project_name + rel_path[src_at + 3:]
+    else:
+        inc_at = rel_path.find("include/")
+        if inc_at != -1:
+            rel_path = rel_path[inc_at + len("include/"):]
+        # Remove at most one leading include-style directory.
+        for dirname in strip_dirs:
+            lead = dirname + '/'
+            if rel_path.startswith(lead):
+                rel_path = rel_path[len(lead):]
+                break
+    return re.sub(r'[-./\s]', '_', rel_path).upper() + '_'
+
+cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc
+
+def process(fname, allow_type):
+    """Lint one file if its extension is allowed.
+
+    Files whose name contains '#', or whose extension is not in
+    ``allow_type``, are ignored.
+    """
+    fname = str(fname)
+    suffix = fname.rsplit('.', 1)[-1]
+    if '#' in fname or suffix not in allow_type:
+        return
+    if suffix in CXX_SUFFIX:
+        _HELPER.process_cpp(fname, suffix)
+    if suffix in PYTHON_SUFFIX:
+        _HELPER.process_python(fname)
+
+def main():
+    """Parse the command line, lint every selected file and exit.
+
+    The process exit status is non-zero when at least one file failed lint.
+    """
+    parser = argparse.ArgumentParser(description="lint source codes")
+    parser.add_argument('project', help='project name')
+    parser.add_argument('filetype', choices=['python', 'cpp', 'all'],
+                        help='source code type')
+    parser.add_argument('path', nargs='+', help='path to traverse')
+    parser.add_argument('--exclude_path', nargs='+', default=[],
+                        help='exclude this path, and all subfolders if path is a folder')
+    parser.add_argument('--pylint-rc', default=None,
+                        help='pylint rc file')
+    args = parser.parse_args()
+
+    _HELPER.project_name = args.project
+    if args.pylint_rc is not None:
+        # An rc file replaces the default pylint options entirely.
+        _HELPER.pylint_opts = ['--rcfile='+args.pylint_rc,]
+
+    # Collect the file extensions selected by the requested file type.
+    allow_type = set()
+    if args.filetype in ('python', 'all'):
+        allow_type.update(PYTHON_SUFFIX)
+    if args.filetype in ('cpp', 'all'):
+        allow_type.update(CXX_SUFFIX)
+
+    if sys.version_info.major == 2 and os.name != 'nt':
+        sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                               codecs.getreader('utf8'),
+                                               codecs.getwriter('utf8'),
+                                               'replace')
+    # get excluded files
+    excluded_paths = filepath_enumerate(args.exclude_path)
+    for path in args.path:
+        if os.path.isfile(path):
+            if os.path.normpath(path) not in excluded_paths:
+                process(path, allow_type)
+            continue
+        for root, _, files in os.walk(path):
+            for name in files:
+                file_path = os.path.normpath(os.path.join(root, name))
+                if file_path not in excluded_paths:
+                    process(file_path, allow_type)
+    nerr = _HELPER.print_summary(sys.stderr)
+    sys.exit(nerr > 0)
+
+if __name__ == '__main__':
+    main()
--- a/tests/pylintrc
+++ b/tests/pylintrc
+[MASTER]
+
+# Specify a configuration file.
+#rcfile=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS
+
+# Add files or directories matching the regex patterns to the blacklist. The
+# regex matches against base names, not paths.
+ignore-patterns=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=8
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loaded into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-whitelist=numpy,opencv
+
+# Allow optimization of some AST trees. This will activate a peephole AST
+# optimizer, which will apply various small optimizations. For instance, it can
+# be used to obtain the result of joining multiple strings with the addition
+# operator. Joining a lot of strings can lead to a maximum recursion error in
+# Pylint and this flag can prevent that. It has one side effect, the resulting
+# AST will be different than the one from reality. This option is deprecated
+# and it will be removed in Pylint 2.0.
+optimize-ast=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by commas (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=indexing-exception,old-raise-syntax
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by commas (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,protected-access,superfluous-parens,invalid-name,no-else-return,useless-super-delegation,len-as-condition,invalid-unary-operand-type,line-too-long,arguments-differ,redefined-builtin,wildcard-import,broad-except,consider-using-enumerate
+# disable=unicode-builtin,delslice-method,using-cmp-argument,setslice-method,dict-view-method,parameter-unpacking,range-builtin-not-iterating,print-statement,file-builtin,old-raise-syntax,basestring-builtin,execfile-builtin,indexing-exception,import-star-module-level,coerce-method,long-builtin,old-ne-operator,old-division,no-absolute-import,raw_input-builtin,old-octal-literal,oct-method,xrange-builtin,hex-method,unpacking-in-except,nonzero-method,raising-string,intern-builtin,reload-builtin,metaclass-assignment,cmp-method,filter-builtin-not-iterating,apply-builtin,map-builtin-not-iterating,next-method-called,unichr-builtin,buffer-builtin,dict-iter-method,input-builtin,coerce-builtin,getslice-method,useless-suppression,standarderror-builtin,zip-builtin-not-iterating,suppressed-message,cmp-builtin,backtick,long-suffix,reduce-builtin,round-builtin
+
+
+[REPORTS]
+
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Put messages in a separate file for each module / package specified on the
+# command line instead of printing them on stdout. Reports (if any) will be
+# written in a file named "pylint_global.[txt|html]". This option is deprecated
+# and it will be removed in Pylint 2.0.
+files-output=no
+
+# Tells whether to display a full report or only the messages
+reports=no
+
+# Python expression which should return a score of at most 10 (10 is the
+# highest score). You have access to the variables error, warning, refactor,
+# convention and statement, which respectively contain the number of messages
+# of each category and the total number of statements analyzed. This is used by
+# the global evaluation report (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+#msg-template=
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+# List of optional constructs for which whitespace checking is disabled. `dict-
+# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
+# `trailing-comma` allows a space between comma and closing bracket: (a, ).
+# `empty-line` allows space-only lines.
+no-space-check=trailing-comma,dict-separator
+
+# Maximum number of lines in a module
+max-module-lines=1000
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take into consideration, separated by a comma.
+notes=FIXME,XXX,TODO
+
+
+[TYPECHECK]
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging
+
+
+[SIMILARITIES]
+
+# Minimum number of lines of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,future.builtins
+
+
+[BASIC]
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=i,j,_,a,b,op,x,y,wd,lr,kv,k,v,s,p,h,c,m,n,X,t,g,f
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+property-classes=abc.abstractproperty
+
+# Regular expression matching correct module names
+module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Naming hint for module names
+module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Regular expression matching correct constant names
+const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Naming hint for constant names
+const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
+
+# Naming hint for inline iteration names
+inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
+
+# Regular expression matching correct method names
+method-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for method names
+method-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct class attribute names
+class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Naming hint for class attribute names
+class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Regular expression matching correct argument names
+argument-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for argument names
+argument-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct attribute names
+attr-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for attribute names
+attr-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct variable names
+variable-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for variable names
+variable-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct function names
+function-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for function names
+function-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct class names
+class-rgx=[A-Z_][a-zA-Z0-9]+$
+
+# Naming hint for class names
+class-name-hint=[A-Z_][a-zA-Z0-9]+$
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# Minimum line length for functions/classes that require docstrings; shorter
+# ones are exempt.
+docstring-min-length=10
+
+
+[ELIF]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=optparse
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=5
+
+# Argument names that match this expression will be ignored. Defaults to names
+# with a leading underscore.
+ignored-argument-names=_.*
+
+# Maximum number of locals for function / method body
+max-locals=15
+
+# Maximum number of return / yield for function / method body
+max-returns=6
+
+# Maximum number of branches for function / method body
+max-branches=12
+
+# Maximum number of statements in function / method body
+max-statements=50
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of boolean expressions in an if statement
+max-bool-expr=5
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
--- a/test/test.py
+++ b/test/test.py
@@ -9,6 +9,8 @@
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 import encoding
+import unittest
+
 import torch
 import torch.nn.functional as F
 from torch.autograd import Variable, gradcheck
@@ -51,7 +53,7 @@ def test_encoding():
    layer = encoding.nn.Encoding(C,K).double().cuda()
    test = gradcheck(layer, input, eps=1e-6, atol=1e-4)
    print('Testing encoding(): {}'.format(test))
-    
+

 def test_sum_square():
    B,C,H,W = 2,3,4,5
@@ -62,17 +64,15 @@ def test_sum_square():
    print('Testing sum_square(): {}'.format(test))


-def test_dilated_avgpool():
-    X = Variable(torch.cuda.FloatTensor(1,3,75,75).uniform_(-0.5,0.5))
-    input = (X,)
-    layer = encoding.nn.DilatedAvgPool2d(kernel_size=2, stride=1, padding=0, dilation=2)
-    test = gradcheck(layer, input, eps=1e-6, atol=1e-4)
-    print('Testing dilatedavgpool2d(): {}'.format(test))
+def test_all_reduce():  # smoke test: allreduce yields one output per input tensor
+    ngpu = torch.cuda.device_count()  # number of CUDA devices reported by torch
+    X = [torch.DoubleTensor(2,4,4).uniform_(-0.5,0.5).cuda(i) for i in range(ngpu)]  # one 2x4x4 double tensor per device index
+    for x in X:
+        x.requires_grad = True  # mark every input as requiring gradients before the call
+    Y = encoding.parallel.allreduce(*X)  # NOTE(review): with zero devices X is empty and this gets no arguments -- confirm that is handled
+    assert (len(X) == len(Y))  # only the output count is checked, not the reduced values


 if __name__ == '__main__':
-    test_scaledL2()
-    test_encoding() 
-    test_aggregate()
-    test_sum_square()
-    test_dilated_avgpool()
+    import nose
+    nose.runmodule()