sync BN

25985c31 · Hang Zhang · d40adbc4 · 25985c31 · 25985c31 · 25985c31
Commit 25985c31 authored Apr 12, 2018 by Hang Zhang
20 changed files
--- a/Makefile
+++ b/Makefile
+# Absolute path of the directory make was invoked from; used to locate
+# the lint configuration under tests/.
+ROOTDIR = $(CURDIR)
+
+# These targets are commands, not files. Declaring them phony keeps a
+# stray file or directory named "lint", "cpplint" or "pylint" from
+# making them look up to date and silently skipping the checks.
+.PHONY: lint cpplint pylint
+
+# Aggregate target: run both the C/C++ and the Python lint passes.
+lint: cpplint pylint
+
+# Runs tests/lint.py with the arguments "encoding cpp src kernel"
+# (presumably project name, language and source dirs -- confirm
+# against tests/lint.py).
+cpplint:
+				tests/lint.py encoding cpp src kernel
+
+# Runs pylint on the "encoding" package with the rcfile in tests/.
+# "$$" passes a literal "$" to the shell so each ignore pattern is
+# end-anchored; shared-library files and "_ext" are excluded.
+pylint:
+				pylint --rcfile=$(ROOTDIR)/tests/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" encoding --ignore=_ext
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ created by [Hang Zhang](http://hangzh.com/)
 ## Citations

 **Context Encoding for Semantic Segmentation**  
-  [Hang Zhang](http://hangzh.com/), [Kristin Dana](http://eceweb1.rutgers.edu/vision/dana.html), [Jianping Shi](http://shijianping.me/), [Zhongyue Zhang](http://zhongyuezhang.com/), [Xiaogang Wang](http://www.ee.cuhk.edu.hk/~xgwang/), [Ambrish Tyagi](https://scholar.google.com/citations?user=GaSWCoUAAAAJ&hl=en), [Amit Agrawal](http://www.amitkagrawal.com/)
+  [Hang Zhang](http://hangzh.com/), [Kristin Dana](http://eceweb1.rutgers.edu/vision/dana.html), [Jianping Shi](http://shijianping.me/), [Zhongyue Zhang](http://zhongyuezhang.com/), [Xiaogang Wang](http://www.ee.cuhk.edu.hk/~xgwang/), [Ambrish Tyagi](https://scholar.google.com/citations?user=GaSWCoUAAAAJ&hl=en), [Amit Agrawal](http://www.amitkagrawal.com/) [[arXiv]](https://arxiv.org/pdf/1803.08904.pdf)
 ```
 @InProceedings{Zhang_2018_CVPR,
 author = {Zhang, Hang and Dana, Kristin and Shi, Jianping and Zhang, Zhongyue and Wang, Xiaogang and Tyagi, Ambrish and Agrawal, Amit},

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -65,7 +65,7 @@ master_doc = 'index'

 # General information about the project.
 project = 'Encoding'
-copyright = '2017, Hang Zhang'
+copyright = '2018, Hang Zhang'
 author = 'Hang Zhang'

 # The version info for the project you're documenting, acts as replacement for

--- a/docs/source/dilated.rst
+++ b/docs/source/dilated.rst
@@ -11,15 +11,8 @@ All provided models have been verified.
 .. note::
    This code is provided together with the paper

-    * Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation"  *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*::
+    * Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation"  *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*

-        @InProceedings{Zhang_2018_CVPR,
-        author = {Zhang, Hang and Dana, Kristin and Shi, Jianping and Zhang, Zhongyue and Wang, Xiaogang and Tyagi, Ambrish and Agrawal, Amit},
-        title = {Context Encoding for Semantic Segmentation},
-        booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
-        month = {June},
-        year = {2018}
-        }

 .. automodule:: encoding.dilated
 .. currentmodule:: encoding.dilated
@@ -91,5 +84,3 @@ DenseNet
 ~~~~~~~~~~~~~~~~~~~~~

 .. autofunction:: densenet201
-
-
--- a/docs/source/encoding.rst
+++ b/docs/source/encoding.rst
 .. role:: hidden
    :class: hidden-section

-My NN Layers
-============
+NN Layers
+=========

-
-Modules
-------
+Customized NN modules in Encoding Package. For Synchronized Cross-GPU Batch Normalization, please visit :class:`encoding.nn.BatchNorm2d`.

 .. currentmodule:: encoding.nn

@@ -34,17 +32,8 @@ Modules
 .. autoclass:: DilatedAvgPool2d
    :members:

-Functions
---------
-
-.. currentmodule:: encoding.functions
-
-:hidden:`aggregate`
-~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: aggregate
+:hidden:`GramMatrix`
+~~~~~~~~~~~~~~~~~~~~

-:hidden:`dilatedavgpool2d`
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: dilatedavgpool2d
+.. autoclass:: GramMatrix
+    :members:
--- a/docs/source/functions.rst
+++ b/docs/source/functions.rst
 .. role:: hidden
    :class: hidden-section

-Other Functions
-===============
+encoding.functions
+==================

 .. automodule:: encoding.functions

 .. currentmodule:: encoding.functions

-
-:hidden:`scaledL2`
-~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: scaledL2
-
-
-:hidden:`upsample`
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: upsample
-
-
-:hidden:`dropout`
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: dropout
-
-
-:hidden:`relu`
+:hidden:`dilatedavgpool2d`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: relu
-
-
-:hidden:`view_each`
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: view_each
-
-
-:hidden:`multi_each`
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: dilatedavgpool2d

-.. autofunction:: multi_each
+:hidden:`aggregate`
+~~~~~~~~~~~~~~~~~~~

+.. autofunction:: aggregate

-:hidden:`sum_each`
-~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: sum_each
+:hidden:`scaledL2`
+~~~~~~~~~~~~~~~~~~~

+.. autofunction:: scaledL2

-:hidden:`cat_each`
-~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: cat_each
+:hidden:`sum_square`
+~~~~~~~~~~~~~~~~~~~~

+.. autofunction:: sum_square
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -9,8 +9,8 @@ Created by `Hang Zhang <http://hangzh.com/>`_

 An optimized PyTorch package with CUDA backend. 

-.. todo::
-    A PyTorch DataParallel compatible Synchronized Cross-GPU Batch Normalization will be provided soon.
+.. note::
+    PyTorch compatible Synchronized Cross-GPU :class:`encoding.nn.BatchNorm2d` has been released.

 .. toctree::
   :glob:
@@ -34,7 +34,6 @@ An optimized PyTorch package with CUDA backend.
   syncbn
   parallel
   dilated
-   nn
   functions
   utils


--- a/docs/source/nn.rst
+++ b/docs/source/nn.rst
-.. role:: hidden
-    :class: hidden-section
-
-Other NN Layers
-===============
-
-.. automodule:: encoding.nn
-
-Customized Layers
-----------------
-
-:hidden:`GramMatrix`
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: GramMatrix
-    :members:
-
-:hidden:`Normalize`
-~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: Normalize
-    :members:
-
-:hidden:`View`
-~~~~~~~~~~~~~~
-
-.. autoclass:: View
-    :members:
-
-Standard Layers
---------------
-
-Standard Layers as in PyTorch but in :class:`encoding.parallel.SelfDataParallel` mode. Use together with SyncBN.
-
-:hidden:`Conv1d`
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: Conv1d
-    :members:
-
-:hidden:`Conv2d`
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: Conv2d
-    :members:
-
-:hidden:`ConvTranspose2d`
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: ConvTranspose2d
-    :members:
-
-:hidden:`ReLU`
-~~~~~~~~~~~~~~
-
-.. autoclass:: ReLU
-    :members:
-
-:hidden:`Sigmoid`
-~~~~~~~~~~~~~~~~~
-
-.. autoclass:: Sigmoid
-    :members:
-
-:hidden:`MaxPool2d`
-~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: MaxPool2d
-    :members:
-
-:hidden:`AvgPool2d`
-~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: AvgPool2d
-    :members:
-
-:hidden:`AdaptiveAvgPool2d`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: AdaptiveAvgPool2d
-    :members:
-
-:hidden:`Dropout2d`
-~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: Dropout2d
-    :members:
-
-:hidden:`Linear`
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: Linear
-    :members:
-
-
--- a/docs/source/notes/compile.rst
+++ b/docs/source/notes/compile.rst
@@ -15,11 +15,13 @@ Install from Source

        - On Linux::

+            pip install -r requirements.txt
            python setup.py install

        - On Mac OSX::

-             MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
+            pip install -r requirements.txt
+            MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install

 Citations
 ---------

--- a/docs/source/notes/extending.rst
+++ b/docs/source/notes/extending.rst
@@ -8,7 +8,7 @@ which is extending :mod:`torch.nn` and
 Torch C and CUDA Backend
 ------------------------

-Given an example of the residual operation (in a mini-batch): 
+Given a simple example of the residual operation (in a mini-batch): 

 .. math::
    r_{ik} = x_i - c_k

--- a/docs/source/notes/syncbn.rst
+++ b/docs/source/notes/syncbn.rst
 Implementing Synchronized Multi-GPU Batch Normalization
 =======================================================

-In this tutorial, we discuss the implementation detail of Multi-GPU Batch Normalization (BN) (classic implementation: :class:`encoding.nn.BatchNorm2d` and compatible :class:`encoding.parallel.SelfDataParallel`). We will provide the training example in a later version.
+In this tutorial, we discuss the implementation details of Multi-GPU Batch Normalization (BN) (classic implementation: :class:`encoding.nn.BatchNorm2d`). We will provide the training example in a later version.

 How BN works?
 -------------
@@ -17,13 +17,13 @@ BN layer was introduced in the paper `Batch Normalization: Accelerating Deep Net
    where :math:`\mu=\frac{\sum_i^N x_i}{N} , \sigma = \sqrt{\frac{\sum_i^N (x_i-\mu)^2}{N}+\epsilon}` and :math:`\gamma, \beta` are the learnable parameters.
        
 - Backward Pass:
-    For calculating the gradient :math:`\frac{d_\ell}{d_{x_i}}`, we need to consider the gradient from :math:`\frac{d_\ell}{d_y}` and the gradients from :math:`\frac{d_\ell}{d_\mu}` and :math:`\frac{d_\ell}{d_\sigma}`, since the :math:`\mu \text{ and } \sigma` are the function of the input :math:`x_i`. We use patial direvative in the notations:
+    For calculating the gradient :math:`\frac{d_\ell}{d_{x_i}}`, we need to consider the partial gradient from :math:`\frac{d_\ell}{d_y}` and the gradients from :math:`\frac{d_\ell}{d_\mu}` and :math:`\frac{d_\ell}{d_\sigma}`, since :math:`\mu \text{ and } \sigma` are functions of the input :math:`x_i`. We use partial derivatives in the notation:

    .. math::

-        \frac{d_\ell}{d_{x_i}} = \frac{d_\ell}{d_{y_i}}\cdot\frac{d_{y_i}}{d_{x_i}} + \frac{d_\ell}{d_\mu}\cdot\frac{d_\mu}{d_{x_i}} + \frac{d_\ell}{d_\sigma}\cdot\frac{d_\sigma}{d_{x_i}}
+        \frac{d_\ell}{d_{x_i}} = \frac{d_\ell}{d_{y_i}}\cdot\frac{\partial_{y_i}}{\partial_{x_i}} + \frac{d_\ell}{d_\mu}\cdot\frac{d_\mu}{d_{x_i}} + \frac{d_\ell}{d_\sigma}\cdot\frac{d_\sigma}{d_{x_i}}

-    where :math:`\frac{d_{y_i}}{d_{x_i}}=\frac{\gamma}{\sigma}, \frac{d_\ell}{d_\mu}=-\frac{\gamma}{\sigma}\sum_i^N\frac{d_\ell}{d_{y_i}}
+    where :math:`\frac{\partial_{y_i}}{\partial_{x_i}}=\frac{\gamma}{\sigma}, \frac{d_\ell}{d_\mu}=-\frac{\gamma}{\sigma}\sum_i^N\frac{d_\ell}{d_{y_i}}
    \text{ and } \frac{d_\sigma}{d_{x_i}}=-\frac{1}{\sigma}(\frac{x_i-\mu}{N})`.

 Why Synchronize BN?
@@ -41,41 +41,27 @@ How to Synchronize?
 Suppose we have :math:`K` number of GPUs, :math:`sum(x)_k` and :math:`sum(x^2)_k` denotes the sum of elements and sum of element squares in :math:`k^{th}` GPU.

 - Forward Pass:
-    We can calculate the sum of elements :math:`sum(x)=\sum x_i \text{ and sum of squares } sum(x^2)=\sum x_i^2` in each GPU, then apply :class:`encoding.parallel.AllReduce` operation to sum accross GPUs. Then calculate the global mean :math:`\mu=\frac{sum(x)}{N} \text{ and global variance } \sigma=\sqrt{\frac{sum(x^2)}{N}-\mu^2+\epsilon}`. 
+    We can calculate the sum of elements :math:`sum(x)=\sum x_i \text{ and sum of squares } sum(x^2)=\sum x_i^2` in each GPU, then apply :class:`encoding.parallel.allreduce` operation to sum across GPUs. Then calculate the global mean :math:`\mu=\frac{sum(x)}{N} \text{ and global variance } \sigma=\sqrt{\frac{sum(x^2)}{N}-\mu^2+\epsilon}`.

 - Backward Pass:
-    * :math:`\frac{d_\ell}{d_{x_i}}=\frac{\gamma}{\sigma}` can be calculated locally in each GPU.
+    * :math:`\frac{d_\ell}{d_{x_i}}=\frac{d_\ell}{d_{y_i}}\frac{\gamma}{\sigma}` can be calculated locally in each GPU.
    * Calculate the gradient of :math:`sum(x)` and :math:`sum(x^2)` individually in each GPU :math:`\frac{d_\ell}{d_{sum(x)_k}}` and :math:`\frac{d_\ell}{d_{sum(x^2)_k}}`. 

-    * Then Sync the gradient (automatically handled by :class:`encoding.parallel.AllReduce`) and continue the backward.
-
-Classic Implementation
-~~~~~~~~~~~~~~~~~~~~~~
-
- Synchronized DataParallel:
-    Standard DataParallel pipeline of public frameworks (MXNet, PyTorch...) in each training iters: 
-
-        * duplicate the network (weights) to all the GPUs,
-        * split the training batch to each GPU,
-        * forward and backward to calculate gradient,
-        * update network parameters (weights) then go to next iter.
-
-    Therefore, communicattion accross different GPUs are not supported. To address this problem, we introduce a :class:`encoding.parallel.SelfDataParallel` mode, which enables each layer to accept mutli-GPU inputs directly. Those self-parallel layers are provide in :class:`encoding.nn`.
-
- Cross GPU Autograd:
-    Due to the BN layers are frequently used in the networks, the PyTorch autograd engine will be messed up by such a complicated backward graph. To address this problem, we provide an aotograd function :class:`encoding.parallel.AllReduce` to handle the cross GPU gradient calculation.
-
-Comparing Performance 
---------------------
-
- Training Time:
-
- Segmentation Performance:
+    * Then Sync the gradient (automatically handled by :class:`encoding.parallel.allreduce`) and continue the backward.


 Citation
 --------

 .. note::
+    This code is provided together with the paper, please cite our work.
+
+        * Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation"  *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*::

-    This code is provided together with the paper (coming soon), please cite our work.
+            @InProceedings{Zhang_2018_CVPR,
+            author = {Zhang, Hang and Dana, Kristin and Shi, Jianping and Zhang, Zhongyue and Wang, Xiaogang and Tyagi, Ambrish and Agrawal, Amit},
+            title = {Context Encoding for Semantic Segmentation},
+            booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+            month = {June},
+            year = {2018}
+            }
--- a/docs/source/parallel.rst
+++ b/docs/source/parallel.rst
@@ -4,8 +4,13 @@
 Data Parallel
 =============

- Current PyTorch DataParallel Table is not supporting mutl-gpu loss calculation, which makes the gpu memory usage very in-efficient. We address this issue here by doing CriterionDataParallel. 
- :class:`encoding.parallel.SelfDataParallel` is compatible with Synchronized Batch Normalization :class:`encoding.nn.BatchNorm2d`.
+- The current PyTorch DataParallel Table does not support multi-GPU loss calculation, which makes the GPU memory usage very imbalanced. We address this issue here by using Model & CriterionDataParallel.
+
+.. note::
+    This code is provided together with the paper
+
+    * Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation"  *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
+

 .. automodule:: encoding.parallel
 .. currentmodule:: encoding.parallel
@@ -22,21 +27,8 @@ Data Parallel
 .. autoclass:: CriterionDataParallel
    :members:

-:hidden:`SelfDataParallel`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: SelfDataParallel
-    :members:
-
-:hidden:`AllReduce`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: AllReduce
-    :members:
-
-:hidden:`Broadcast`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: Broadcast
-    :members:
+:hidden:`allreduce`
+~~~~~~~~~~~~~~~~~~~

+.. autofunction:: allreduce
--- a/docs/source/syncbn.rst
+++ b/docs/source/syncbn.rst
@@ -4,9 +4,8 @@
 Synchronized BatchNorm
 ======================

-The current BN is implementated insynchronized accross the gpus, which is a big problem for memory consuming tasks such as Semantic Segmenation, since the mini-batch is very small. 
-To synchronize the batchnorm accross multiple gpus is not easy to implment within the current Dataparallel framework. We address this difficulty by making each layer 'self-parallel' :class:`encoding.parallel.SelfDataParallel`, that is accepting the inputs from multi-gpus. Therefore, we can handle the synchronizing across gpus.
-
+.. note::
+    The original ``Self-Parallel`` version of ``BatchNorm`` has been deprecated in favor of PyTorch Compatible :class:`encoding.nn.BatchNorm2d`.

 .. currentmodule:: encoding.nn

@@ -44,9 +43,3 @@ Functions
 ~~~~~~~~~~~~~~~~~~~~~~~

 .. autofunction:: batchnormeval
-
-:hidden:`sum_square`
-~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: sum_square
-
--- a/docs/source/utils.rst
+++ b/docs/source/utils.rst
@@ -24,8 +24,3 @@ Useful util functions.
 ~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autofunction:: save_checkpoint
-
-:hidden:`progress_bar`
-~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: progress_bar
--- a/encoding/__init__.py
+++ b/encoding/__init__.py
@@ -5,13 +5,9 @@
 ## Copyright (c) 2017
 ##
 ## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
+## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

+"""An optimized PyTorch package with CUDA backend."""
 from .version import __version__
-
-import encoding.nn
-import encoding.functions
-import encoding.dilated
-import encoding.parallel 
-import encoding.utils
+from . import nn, functions, dilated, parallel, utils
--- a/encoding/dilated/__init__.py
+++ b/encoding/dilated/__init__.py
+"""Dilated ResNet and DenseNet"""
 from .resnet import *
 from .densenet import *
--- a/encoding/dilated/densenet.py
+++ b/encoding/dilated/densenet.py
+"""Dilated DenseNet"""
+from collections import OrderedDict
+
 import torch
-from torch.autograd import Variable
 import torch.utils.model_zoo as model_zoo
-from collections import OrderedDict

 from .. import nn
 from .. import functions as F
@@ -74,17 +75,18 @@ def densenet161(pretrained=False, **kwargs):


 class _DenseLayer(nn.Sequential):
+    # pylint: disable=expression-not-assigned
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
        super(_DenseLayer, self).__init__()
        self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
        self.add_module('relu.1', nn.ReLU(inplace=True)),
-        self.add_module('conv.1', nn.Conv2d(num_input_features, bn_size *
-                        growth_rate, kernel_size=1, stride=1, bias=False)),
+        self.add_module('conv.1', nn.Conv2d(
+            num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
        self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
        self.add_module('relu.2', nn.ReLU(inplace=True)),
-        self.add_module('conv.2', nn.Conv2d(bn_size * growth_rate, growth_rate,
-                        kernel_size=3, stride=1, padding=dilation, dilation=dilation, 
-                        bias=False)),
+        self.add_module('conv.2', nn.Conv2d(
+            bn_size * growth_rate, growth_rate, kernel_size=3, stride=1,
+            padding=dilation, dilation=dilation, bias=False)),
        self.drop_rate = drop_rate

    def forward(self, x):
@@ -92,12 +94,7 @@ class _DenseLayer(nn.Sequential):
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)

-        if isinstance(x, Variable):
-            return torch.cat([x, new_features], 1)
-        elif isinstance(x, tuple) or isinstance(x, list):
-            return F.cat_each(x, new_features, 1)
-        else:
-            raise RuntimeError('unknown input type')
+        return torch.cat([x, new_features], 1)


 class _DenseBlock(nn.Sequential):
@@ -115,8 +112,8 @@ class _Transition(nn.Sequential):
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
                                          kernel_size=1, stride=1, bias=False))
-        self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride, 
-                                                 dilation=dilation))
+        self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride,
+                                                    dilation=dilation))


 class DenseNet(nn.Module):
@@ -150,8 +147,8 @@ class DenseNet(nn.Module):
        ]))

        # Each denseblock
-        strides = [1,2,1,1]
-        dilations = [1,1,2,4]
+        strides = [1, 2, 1, 1]
+        dilations = [1, 1, 2, 4]
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
@@ -173,8 +170,6 @@ class DenseNet(nn.Module):
    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
-        """
-        out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
-        out = self.classifier(out)
-        """
+        # out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
+        # out = self.classifier(out)
        return out
--- a/encoding/dilated/resnet.py
+++ b/encoding/dilated/resnet.py
-from .. import nn
+"""Dilated ResNet"""
 import math
-from torch.autograd import Variable
 import torch.utils.model_zoo as model_zoo
+from .. import nn

 __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152', 'BasicBlock', 'Bottleneck']
@@ -22,6 +22,8 @@ def conv3x3(in_planes, out_planes, stride=1):


 class BasicBlock(nn.Module):
+    """ResNet BasicBlock
+    """
    expansion = 1
    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1):
        super(BasicBlock, self).__init__()
@@ -55,25 +57,29 @@ class BasicBlock(nn.Module):


 class Bottleneck(nn.Module):
+    """ResNet Bottleneck
+    """
+    # pylint: disable=unused-argument
    expansion = 4
-    def __init__(self, inplanes, planes, stride=1, dilation=1, 
-            downsample=None, first_dilation=1):
+    def __init__(self, inplanes, planes, stride=1, dilation=1,
+                 downsample=None, first_dilation=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, stride=stride,
            padding=dilation, dilation=dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, 
-            bias=False)
+        self.conv3 = nn.Conv2d(
+            planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride
-    
+
    def _sum_each(self, x, y):
-        assert(len(x)==len(y))
+        assert(len(x) == len(y))
        z = []
        for i in range(len(x)):
            z.append(x[i]+y[i])
@@ -96,11 +102,7 @@ class Bottleneck(nn.Module):
        if self.downsample is not None:
            residual = self.downsample(x)

-        if isinstance(out, Variable):
-            out += residual
-        elif isinstance(out, tuple) or isinstance(out, list):
-            out = self._sum_each(out, residual)
-            
+        out += residual
        out = self.relu(out)

        return out
@@ -108,13 +110,14 @@ class Bottleneck(nn.Module):

 class ResNet(nn.Module):
    """Dilated Pre-trained ResNet Model, which preduces the stride of 8 featuremaps at conv5.
-    
+
    Reference:

        - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.

        - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """
+    # pylint: disable=unused-variable
    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
@@ -149,14 +152,14 @@ class ResNet(nn.Module):

        layers = []
        if dilation == 1 or dilation == 2:
-            layers.append(block(self.inplanes, planes, stride, dilation=1, 
+            layers.append(block(self.inplanes, planes, stride, dilation=1,
                                downsample=downsample, first_dilation=dilation))
-        elif dilation ==4:
-            layers.append(block(self.inplanes, planes, stride, dilation=2, 
+        elif dilation == 4:
+            layers.append(block(self.inplanes, planes, stride, dilation=2,
                                downsample=downsample, first_dilation=dilation))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))
-            
+
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation))
@@ -239,8 +242,3 @@ def resnet152(pretrained=False, **kwargs):
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model
-
-if __name__ == "__main__":
-    model = ResNet(Bottleneck, [3, 4, 23, 3])
-    model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
-    print(model.layer4)
--- a/encoding/functions/__init__.py
+++ b/encoding/functions/__init__.py
-
+"""Encoding Autograd Functions"""
 from .encoding import *
-from .basic import *
 from .syncbn import *
 from .customize import *
--- a/encoding/functions/basic.py
+++ b/encoding/functions/basic.py
-##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
-##
-## This source code is licensed under the MIT-style license found in the
-## LICENSE file in the root directory of this source tree 
-##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-
-import threading
-import torch
-import torch.nn.functional as F
-from torch.autograd import Function, Variable
-
-__all__ = ['squeeze_each', 'view_each', 'multi_each', 'sum_each', 
-    'cat_each', 'upsample', 'dropout', 'relu']
-
-def squeeze_each(x, dim=None):
-    """Multi-GPU version torch. squeeze()
-    
-    """
-    y = []
-    for i in range(len(x)):
-        if dim is None:
-            y.append(x[i].squeeze())
-        else:
-            y.append(x[i].squeeze(dim))
-    return y
-
-def view_each(x, size):
-    """Multi-GPU version torch.view
-
-    Returns a new tensor with the same data but different size.
-    The returned tensor shares the same data and must have the same number
-    of elements, but may have a different size. A tensor must be
-    :attr:`contiguous` to be viewed.
-
-    Args:
-        input: list of multi-gpu tensors
-        size (torch.Size or int...): Desired size
-
-    """
-    y = []
-    for i in range(len(x)):
-        y.append(x[i].view(size))
-    return y
-
-def multi_each(a, b):
-    """Multi-GPU version multiplication
-
-    .. math::
-        y[i] = a[i] * b[i]
-    """
-    y = []
-    for i in range(len(a)):
-        y.append(a[i] * b[i])
-    return y
-
-def sum_each(x, y):
-    """Multi-GPU version torch.add
-
-    .. math::
-        y[i] = a[i] + b[i]
-    """
-    assert(len(x)==len(y))
-    z = []
-    for i in range(len(x)):
-        z.append(x[i]+y[i])
-    return z
-
-
-def cat_each(x1, x2, dim):
-    """Multi-GPU version torch.cat
-
-    .. math::
-        y[i] = torch.cat(a[i], b[i], dim)
-    """
-    assert(len(x1)==len(x2))
-    z = []
-    for i in range(len(x1)):
-        with torch.cuda.device_of(x1[i]):
-            x = torch.cat((x1[i], x2[i]), dim)
-            z.append(x)
-    return z
-
-
-def dict_to_list(x):
-    """Converting Dict{} to list[]
-    """
-    y = []
-    for i in range(len(x)):
-        xi = x[i]
-        if isinstance(xi, Exception):
-            raise xi
-        y.append(xi)
-    return y
-
-
-def upsample(input, size=None, scale_factor=None, mode='nearest'):
-    """Multi-GPU version torch.nn.functional.upsample
-
-    Upsamples the input to either the given :attr:`size` or the given
-    :attr:`scale_factor`
-
-    The algorithm used for upsampling is determined by :attr:`mode`.
-
-    Currently temporal, spatial and volumetric upsampling are supported, i.e.
-    expected inputs are 3-D, 4-D or 5-D in shape.
-
-    The input dimensions are interpreted in the form:
-    `mini-batch x channels x [depth] x [height] x width`
-
-    The modes available for upsampling are: `nearest`, `linear` (3D-only),
-    `bilinear` (4D-only), `trilinear` (5D-only)
-
-    Args:
-        input (Variable): input
-        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
-            output spatial size.
-        scale_factor (int): multiplier for spatial size. Has to be an integer.
-        mode (string): algorithm used for upsampling:
-            'nearest' | 'linear' | 'bilinear' | 'trilinear'. Default: 'nearest'
-    """
-    if isinstance(input, Variable):
-        return F.upsample(input, size=size, scale_factor=scale_factor,
-                          mode=mode)
-    elif isinstance(input, tuple) or isinstance(input, list):
-        lock = threading.Lock()
-        results = {}
-        def _worker(i, x):
-            try:
-                with torch.cuda.device_of(x):
-                    result =  F.upsample(x, size=size, \
-                        scale_factor=scale_factor,mode=mode)
-                with lock:
-                    results[i] = result
-            except Exception as e:
-                with lock:
-                    resutls[i] = e 
-        # multi-threading for different gpu
-        threads = [threading.Thread(target=_worker,
-                                    args=(i, x),
-                                    )
-                   for i, (x) in enumerate(input)]
-        for thread in threads:
-            thread.start()
-        for thread in threads:
-            thread.join() 
-        outputs = dict_to_list(results)
-        return outputs
-    else:
-        raise RuntimeError('unknown input type')
-
-
-def dropout(input, p=0.5, training=False, inplace=True):
-    """Multi-GPU version torch.nn.functional.droupout
-
-    The channels to zero-out are randomized on every forward call.
-
-    *Usually the input comes from Conv2d modules.*
-
-    As described in the paper
-    `Efficient Object Localization Using Convolutional Networks`,
-    if adjacent pixels within feature maps are strongly correlated
-    (as is normally the case in early convolution layers) then iid dropout
-    will not regularize the activations and will otherwise just result
-    in an effective learning rate decrease.
-
-    In this case, :func:`nn.Dropout2d` will help promote independence between
-    feature maps and should be used instead.
-
-    Args:
-        p (float, optional): probability of an element to be zeroed.
-        inplace (bool, optional): If set to True, will do this operation
-            in-place
-
-    Shape:
-        - Input: :math:`(N, C, H, W)`
-        - Output: :math:`(N, C, H, W)` (same shape as input)
-    
-    """
-    if isinstance(input, Variable):
-        return F.dropout(input, p, training, inplace)
-    elif isinstance(input, tuple) or isinstance(input, list):
-        lock = threading.Lock()
-        results = {}
-        def _worker(i, x):
-            try:
-                with torch.cuda.device_of(x):
-                    result =  F.dropout(x, p, training, inplace)
-                with lock:
-                    results[i] = result
-            except Exception as e:
-                with lock:
-                    resutls[i] = e 
-        # multi-threading for different gpu
-        threads = [threading.Thread(target=_worker,
-                                    args=(i, x),
-                                    )
-                   for i, (x) in enumerate(input)]
-        for thread in threads:
-            thread.start()
-        for thread in threads:
-            thread.join() 
-        outputs = dict_to_list(results)
-        return outputs
-    else:
-        raise RuntimeError('unknown input type')
-
-def relu(input, inplace=False):
-    """Multi-GPU version torch.nn.functional.relu
-
-    Applies the rectified linear unit function element-wise
-    :math:`{ReLU}(x)= max(0, x)`
-
-    Args:
-        inplace: can optionally do the operation in-place. Default: False
-
-    Shape:
-        - Input: :math:`(N, *)` where `*` means, any number of additional
-          dimensions
-        - Output: :math:`(N, *)`, same shape as the input
-    """
-    if isinstance(input, Variable):
-        return F.relu(input, inplace)
-    elif isinstance(input, tuple) or isinstance(input, list):
-        lock = threading.Lock()
-        results = {}
-        def _worker(i, x):
-            try:
-                with torch.cuda.device_of(x):
-                    result =  F.relu(x, inplace)
-                with lock:
-                    results[i] = result
-            except Exception as e:
-                with lock:
-                    resutls[i] = e 
-        # multi-threading for different gpu
-        threads = [threading.Thread(target=_worker,
-                                    args=(i, x),
-                                    )
-                   for i, (x) in enumerate(input)]
-        for thread in threads:
-            thread.start()
-        for thread in threads:
-            thread.join() 
-        outputs = dict_to_list(results)
-        return outputs
-
-    else:
-        raise RuntimeError('unknown input type')