We provide correctly dilated pre-trained ResNet and DenseNet models for semantic segmentation.
For the dilation of ResNet, we replace the stride-2 Conv3x3 at the beginning of certain stages with a stride-1 convolution, and update the dilation of the conv layers that follow.
For the dilation of DenseNet, we provide :class:`encoding.nn.DilatedAvgPool2d`, which handles the dilation of the transition layers, and update the dilation of the conv layers that follow.
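The ResNet trick above can be reproduced stand-alone. Here is a minimal sketch using torchvision's equivalent ``replace_stride_with_dilation`` switch (an illustration only, not this package's :class:`encoding.dilated` API)::

    import torch
    from torchvision.models import resnet50

    # dilate the last two stages instead of striding, as described above
    net = resnet50(replace_stride_with_dilation=[False, True, True])
    backbone = torch.nn.Sequential(*list(net.children())[:-2])  # drop pool/fc

    x = torch.randn(1, 3, 224, 224)
    print(backbone(x).shape)  # torch.Size([1, 2048, 28, 28]): output stride 8, not 32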
`Image Style Transfer Using Convolutional Neural Networks <http://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf>`_ by Leon A. Gatys, Alexander S. Ecker, and Matthias Bethge.
PyTorch-Encoding is an optimized PyTorch package with a CUDA backend, including the Encoding Layer, Multi-GPU Synchronized Batch Normalization and useful utility functions. Example systems are also provided in the `experiments section <experiments/texture.html>`_. We hope this software will accelerate your research; please cite our `papers <notes/compile.html>`_.
- An optimized PyTorch package with a CUDA backend, including the Encoding Layer :class:`encoding.nn.Encoding`, Multi-GPU Synchronized Batch Normalization :class:`encoding.nn.BatchNorm2d`, and other customized modules and functions.
- **Example Systems** for Semantic Segmentation (coming), CIFAR-10 Classification, `Texture Recognition <experiments/texture.html>`_ and `Style Transfer <experiments/style.html>`_ are provided in the experiments section.
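For a quick feel of the Encoding Layer, here is a minimal sketch; the ``D`` (feature dimension) and ``K`` (number of codewords) arguments and the B×K×D output follow the Deep TEN paper, but treat the exact signature as an assumption::

    import torch
    import encoding

    B, D, H, W = 2, 128, 8, 8
    layer = encoding.nn.Encoding(D=128, K=32).cuda()  # 32 learnable codewords
    x = torch.randn(B, D, H, W).cuda()                # CNN feature maps
    e = layer(x)                                      # aggregated residual encodings
    print(e.shape)                                    # expected: (2, 32, 128)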
Install from Source
-------------------
* Please follow the `PyTorch instructions <https://github.com/pytorch/pytorch#from-source>`_ to install PyTorch from Source to the ``$HOME`` directory (recommended). Or you can simply clone a copy to ``$HOME`` directory::
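    # clone PyTorch (with its submodules) into $HOME; --recursive pulls the
    # third-party dependencies needed for a source build
    git clone --recursive https://github.com/pytorch/pytorch $HOME/pytorch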
Why Synchronize?
----------------
In this tutorial, we discuss the implementation details of Multi-GPU Batch Normalization (BN) :class:`encoding.nn.BatchNorm2d` and the compatible :class:`encoding.parallel.SelfDataParallel`. We will provide a training example in a later version.
How does BN work?
-----------------
The BN layer was introduced in the paper `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_. It dramatically speeds up the training of the network (enabling larger learning rates) and makes the network less sensitive to the weight initialization.
- Forward Pass:
For the input data :math:`X=\{x_1,...,x_N\}`, the data are normalized to be zero-mean and unit-variance, then scaled and shifted:
.. math::
y_i = \gamma\cdot\frac{x_i-\mu}{\sigma} + \beta ,
where :math:`\mu=\frac{\sum_i^N x_i}{N} , \sigma = \sqrt{\frac{\sum_i^N (x_i-\mu)^2}{N}+\epsilon}` and :math:`\gamma, \beta` are the learnable parameters.
- Backward Pass:
For calculating the gradient :math:`\frac{d_\ell}{d_{x_i}}`, we need to consider the gradient from :math:`\frac{d_\ell}{d_{y_i}}` as well as the gradients from :math:`\frac{d_\ell}{d_\mu}` and :math:`\frac{d_\ell}{d_\sigma}`, since :math:`\mu` and :math:`\sigma` are functions of the input :math:`x_i`. Using partial derivatives in the notation:

.. math::
    \frac{d_\ell}{d_{x_i}} = \frac{d_\ell}{d_{y_i}}\cdot\frac{d_{y_i}}{d_{x_i}} + \frac{d_\ell}{d_\mu}\cdot\frac{d_\mu}{d_{x_i}} + \frac{d_\ell}{d_\sigma}\cdot\frac{d_\sigma}{d_{x_i}} ,

where :math:`\frac{d_{y_i}}{d_{x_i}}=\frac{\gamma}{\sigma}, \frac{d_\mu}{d_{x_i}}=\frac{1}{N} \text{ and } \frac{d_\sigma}{d_{x_i}}=\frac{x_i-\mu}{N\sigma}`, and the global terms are :math:`\frac{d_\ell}{d_\mu}=-\frac{\gamma}{\sigma}\sum_i^N\frac{d_\ell}{d_{y_i}} \text{ and } \frac{d_\ell}{d_\sigma}=-\frac{\gamma}{\sigma^2}\sum_i^N\frac{d_\ell}{d_{y_i}}(x_i-\mu)`.
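A minimal stand-alone sketch of this computation (plain PyTorch on a single feature vector; the names ``gamma``, ``beta`` and ``eps`` are illustrative), letting autograd compose exactly the partial derivatives above::

    import torch

    def batch_norm(x, gamma, beta, eps=1e-5):
        # normalize to zero mean and unit variance, then scale and shift
        mu = x.mean()
        sigma = torch.sqrt(((x - mu) ** 2).mean() + eps)
        return gamma * (x - mu) / sigma + beta

    x = torch.randn(8, requires_grad=True)
    y = batch_norm(x, torch.tensor(2.0), torch.tensor(0.5))
    y.sum().backward()  # chain rule through mu and sigma, as derived above
    print(x.grad)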
Why Synchronize BN?
-------------------
- Standard implementations of BN in public frameworks (such as Caffe, MXNet, Torch, TF, PyTorch) are unsynchronized, which means that the data are normalized within each GPU. Therefore the `working batch-size` of the BN layer is `BatchSize/nGPU` (the batch-size in each GPU).
- Since the `working batch-size` is typically large enough for standard vision tasks, such as classification and detection, there is no need to synchronize the BN layers during training. The synchronization would slow down the training.
- However, for the Semantic Segmentation task, the state-of-the-art approaches typically adopt dilated convolution, which is very memory-consuming. The `working batch-size` can be too small for BN layers (2 or 4 in each GPU) when using larger/deeper pre-trained networks, such as :class:`encoding.dilated.ResNet` or :class:`encoding.dilated.DenseNet`.
How to Synchronize?
-------------------
- Forward and Backward Pass
Suppose we have :math:`K` GPUs; :math:`sum(x)_k` and :math:`sum(x^2)_k` denote the sum of elements and the sum of squared elements on the :math:`k^{th}` GPU.
- Forward Pass:
We can calculate the sum of elements :math:`sum(x)=\sum x_i \text{ and the sum of squares } sum(x^2)=\sum x_i^2` in each GPU, then apply the :class:`encoding.parallel.AllReduce` operation to sum them across GPUs, and calculate the global mean :math:`\mu=\frac{sum(x)}{N} \text{ and the global standard deviation } \sigma=\sqrt{\frac{sum(x^2)}{N}-\mu^2+\epsilon}`.
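A single-process sketch of these global statistics, where plain Python sums stand in for the :class:`encoding.parallel.AllReduce` reduction::

    import torch

    def global_stats(per_gpu_x, eps=1e-5):
        # per_gpu_x: list of tensors, one per GPU
        n = sum(x.numel() for x in per_gpu_x)
        xsum = sum(x.sum() for x in per_gpu_x)          # AllReduce of sum(x)
        xsqsum = sum((x * x).sum() for x in per_gpu_x)  # AllReduce of sum(x^2)
        mu = xsum / n
        sigma = torch.sqrt(xsqsum / n - mu ** 2 + eps)  # global mean and std
        return mu, sigma

    mu, sigma = global_stats([torch.randn(4, 16), torch.randn(4, 16)])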
- Backward Pass:
* The term :math:`\frac{d_\ell}{d_{y_i}}\cdot\frac{\gamma}{\sigma}` can be calculated locally in each GPU.
* Calculate the gradients with respect to :math:`sum(x)` and :math:`sum(x^2)` individually in each GPU, i.e. :math:`\frac{d_\ell}{d_{sum(x)_k}}` and :math:`\frac{d_\ell}{d_{sum(x^2)_k}}`.
* Then sync the gradients (automatically handled by :class:`encoding.parallel.AllReduce`) and continue the backward pass, as sketched below.
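Putting the pieces together: since :math:`\frac{d_{sum(x)}}{d_{x_i}}=1` and :math:`\frac{d_{sum(x^2)}}{d_{x_i}}=2x_i`, each GPU can recombine the reduced gradients locally as

.. math::
    \frac{d_\ell}{d_{x_i}} = \frac{d_\ell}{d_{y_i}}\cdot\frac{\gamma}{\sigma} + \frac{d_\ell}{d_{sum(x)}} + 2x_i\cdot\frac{d_\ell}{d_{sum(x^2)}} .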
- Synchronized DataParallel:
The standard DataParallel pipeline of public frameworks (MXNet, PyTorch...) in each training iteration:
* duplicate the network (weights) to all the GPUs,
* split the training batch across the GPUs,
* run forward and backward passes to calculate the gradients,
* update the network parameters (weights), then move on to the next iteration.
Therefore, communication across different GPUs is not supported. To address this problem, we introduce a :class:`encoding.parallel.SelfDataParallel` mode, which enables each layer to accept multi-GPU inputs directly. Those self-parallel layers are provided in :class:`encoding.nn`.
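A hypothetical usage sketch (assuming :class:`encoding.parallel.SelfDataParallel` wraps a module the same way ``torch.nn.DataParallel`` does; treat the exact signature as an assumption)::

    import torch
    import encoding

    model = torch.nn.Sequential(
        torch.nn.Conv2d(3, 64, 3, padding=1),
        encoding.nn.BatchNorm2d(64),  # synchronized across GPUs
        torch.nn.ReLU(inplace=True),
    )
    # every layer then accepts the multi-GPU inputs directly
    model = encoding.parallel.SelfDataParallel(model.cuda())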
- Cross GPU Autograd:
Because BN layers are frequently used in the networks, such a complicated backward graph would mess up the PyTorch autograd engine. To address this problem, we provide an autograd function :class:`encoding.parallel.AllReduce` to handle the cross-GPU gradient calculation.
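A toy single-process analogue of such an autograd function (the real :class:`encoding.parallel.AllReduce` works across devices; this sketch only shows the reduce-in-forward, reduce-in-backward pattern)::

    import torch

    class ToyAllReduce(torch.autograd.Function):
        # forward: every "device" receives the sum of all inputs
        @staticmethod
        def forward(ctx, *inputs):
            total = sum(inputs)
            return tuple(total.clone() for _ in inputs)

        # backward: the output gradients are also summed and broadcast back
        @staticmethod
        def backward(ctx, *grad_outputs):
            total = sum(grad_outputs)
            return tuple(total.clone() for _ in grad_outputs)

    a = torch.randn(3, requires_grad=True)
    b = torch.randn(3, requires_grad=True)
    ya, yb = ToyAllReduce.apply(a, b)
    (ya * 1.0 + yb * 2.0).sum().backward()
    print(torch.equal(a.grad, b.grad))  # True: gradients are all-reduced too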
Comparing Performance
---------------------
- Training Time:
- Segmentation Performance:
Citation
--------
.. note::
    This code is provided together with the paper (coming soon); please cite our work.
- The current PyTorch DataParallel does not support multi-GPU loss calculation, which makes the GPU memory usage very inefficient. We address this issue here with CriterionDataParallel.
- :class:`encoding.parallel.SelfDataParallel` is compatible with Synchronized Batch Normalization :class:`encoding.nn.BatchNorm2d`.
The current BN is implemented unsynchronized across the GPUs, which is a big problem for memory-consuming tasks such as Semantic Segmentation, since the mini-batch on each GPU is very small.

Synchronizing the batchnorm across multiple GPUs is not easy to implement within the current DataParallel framework. We address this difficulty by making each layer 'self-parallel' via :class:`encoding.parallel.SelfDataParallel`, that is, accepting inputs from multiple GPUs directly. Therefore, we can handle each layer separately when synchronizing it across GPUs.

We will release the whole SyncBN module and the compatible DataParallel later.