Unverified Commit 67e153dd authored by Hang Zhang, committed by GitHub

update and fix bugs (#51)

parent 71447e1b
@@ -15,8 +15,8 @@ import subprocess
 from torch.utils.ffi import create_extension
 lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib')
-cwd = os.path.dirname(os.path.realpath(__file__))
-encoding_lib_path = os.path.join(cwd, "encoding", "lib")
+cwd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'encoding/')
+encoding_lib_path = os.path.join(cwd, "lib")
 # clean the build files
 clean_cmd = ['bash', 'clean.sh']
@@ -25,13 +25,13 @@ subprocess.check_call(clean_cmd)
 # build CUDA library
 os.environ['TORCH_BUILD_DIR'] = lib_path
 if platform.system() == 'Darwin':
-    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.1.dylib')
-    ENCODING_LIB = os.path.join(cwd, 'encoding/lib/libENCODING.dylib')
+    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.dylib')
+    ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.dylib')
 else:
     os.environ['CFLAGS'] = '-std=c99'
-    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.so.1')
-    ENCODING_LIB = os.path.join(cwd, 'encoding/lib/libENCODING.so')
+    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.so')
+    ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.so')
 build_all_cmd = ['bash', 'encoding/make.sh']
 subprocess.check_call(build_all_cmd, env=dict(os.environ))
@@ -45,9 +45,9 @@ defines = [('WITH_CUDA', None)]
 with_cuda = True
 include_path = [os.path.join(lib_path, 'include'),
-                os.path.join(cwd,'encoding/kernel'),
-                os.path.join(cwd,'encoding/kernel/include'),
-                os.path.join(cwd,'encoding/src/')]
+                os.path.join(cwd,'kernel'),
+                os.path.join(cwd,'kernel/include'),
+                os.path.join(cwd,'src/')]
 def make_relative_rpath(path):
     if platform.system() == 'Darwin':
@@ -63,6 +63,7 @@ ffi = create_extension(
     define_macros=defines,
     relative_to=__file__,
     with_cuda=with_cuda,
+    extra_compile_args=["-std=c99"],
     include_dirs = include_path,
     extra_link_args = [
         make_relative_rpath(lib_path),
...
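The hunks above re-root ``cwd`` at the ``encoding/`` package directory, so every later join drops the repeated ``encoding`` path component. A quick path-only sketch (with a hypothetical checkout location) showing that the old and new spellings resolve to the same directory::

    import os.path

    repo = '/home/user/PyTorch-Encoding'   # hypothetical checkout path
    old_cwd = repo
    new_cwd = os.path.join(repo, 'encoding/')

    # old spelling joined from the repo root, new spelling from the package dir
    old_lib = os.path.join(old_cwd, 'encoding', 'lib')
    new_lib = os.path.join(new_cwd, 'lib')
    assert os.path.normpath(old_lib) == os.path.normpath(new_lib)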
 #!/usr/bin/env bash
-rm -rf build/ dist/ torch_encoding.egg-info/ encoding/lib/ encoding/_ext/ __pycache__ encoding/__pycache__
+rm -rf build/ dist/ encoding.egg-info/ encoding/lib/ encoding/_ext/ __pycache__ encoding/__pycache__
...
 .. role:: hidden
     :class: hidden-section
-Dilated Networks
+encoding.dilated
 ================
 We provide correctly dilated pre-trained ResNet and DenseNet (stride of 8) for semantic segmentation.
...
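With the page now titled ``encoding.dilated``, a hedged usage sketch (assuming the constructors keep the torchvision-style ``pretrained`` flag; the function names come from the ``__all__`` list in the ResNet hunk later in this diff)::

    import encoding

    # Dilated backbone with output stride 8, per the note above.
    model = encoding.dilated.resnet50(pretrained=True)
    model.eval()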
@@ -4,10 +4,20 @@
 encoding.functions
 ==================
-.. automodule:: encoding.functions
+.. automodule:: encoding.Functions
 .. currentmodule:: encoding.functions
+:hidden:`batchnorm`
+~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batchnorm
+:hidden:`batchnormeval`
+~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batchnormeval
 :hidden:`dilatedavgpool2d`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
...
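The newly documented ``batchnorm``/``batchnormeval`` entries are the functional layer underneath the SyncBN module later in this diff. A hedged sketch of how the pieces compose, with shapes and formulas taken from ``_SyncBatchNorm.forward`` below (input viewed as B x C x N); this is only the single-GPU arithmetic, the training path additionally all-reduces the sums across devices::

    import torch
    from torch.autograd import Variable
    from encoding.functions import sum_square, batchnormeval

    x = Variable(torch.randn(4, 8, 16).double().cuda())
    gamma = Variable(torch.ones(8).double().cuda())
    beta = Variable(torch.zeros(8).double().cuda())

    xsum, xsquare = sum_square(x)        # per-channel sum(x) and sum(x^2)
    N = x.size(0) * x.size(2)
    mean = xsum / N
    std = (xsquare / N - mean * mean).clamp(1e-5).sqrt()
    y = batchnormeval(x, gamma, beta, mean, std)   # normalized output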
@@ -10,7 +10,7 @@ Created by `Hang Zhang <http://hangzh.com/>`_
 An optimized PyTorch package with CUDA backend.
 .. note::
-    Please checkout the PyTorch compatible Synchronized Cross-GPU :class:`encoding.nn.BatchNorm2d` and the `mnist example <https://github.com/zhanghang1989/PyTorch-SyncBatchNorm>`_.
+    Please check out the PyTorch-compatible Synchronized Cross-GPU :class:`encoding.nn.SyncBatchNorm2d` and the `MNIST example <https://github.com/zhanghang1989/PyTorch-SyncBatchNorm>`_.
 .. toctree::
     :glob:
@@ -30,8 +30,7 @@ An optimized PyTorch package with CUDA backend.
     :maxdepth: 1
     :caption: Package Reference
-    encoding
-    syncbn
+    nn
     parallel
     dilated
     functions
...
@@ -5,9 +5,9 @@ Install and Citations
 Install from Source
 -------------------
-* Install PyTorch from Source (recommended). Please follow the `PyTorch instructions <https://github.com/pytorch/pytorch#from-source>`_.
-* Install this package
+* Install PyTorch by following the `PyTorch instructions <http://pytorch.org/>`_.
+* Install from source
   - Clone the repo::
@@ -15,12 +15,10 @@ Install from Source
   - On Linux::
-        pip install -r requirements.txt
         python setup.py install
   - On Mac OSX::
-        pip install -r requirements.txt
         MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
 Citations
...
 .. role:: hidden
     :class: hidden-section
-Data Parallel
-=============
+encoding.parallel
+=================
-- The current PyTorch DataParallel does not support multi-GPU loss calculation, which makes the GPU memory usage very imbalanced. We address this issue here by doing Model & CriterionDataParallel.
+- The current PyTorch DataParallel does not support multi-GPU loss calculation, which makes the GPU memory usage very imbalanced. We address this issue here by doing DataParallel for both the Model & Criterion.
 .. note::
     This code is provided together with the paper
@@ -15,16 +15,16 @@ Data Parallel
 .. automodule:: encoding.parallel
 .. currentmodule:: encoding.parallel
-:hidden:`ModelDataParallel`
+:hidden:`DataParallelModel`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: ModelDataParallel
+.. autoclass:: DataParallelModel
     :members:
-:hidden:`CriterionDataParallel`
+:hidden:`DataParallelCriterion`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: CriterionDataParallel
+.. autoclass:: DataParallelCriterion
     :members:
...
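For orientation, the renamed classes are meant to be used as a pair; a hedged sketch of the intended training-loop pattern (toy model, shapes invented for illustration)::

    import torch
    from torch.autograd import Variable
    import encoding

    net = encoding.parallel.DataParallelModel(torch.nn.Linear(10, 2).cuda())
    criterion = encoding.parallel.DataParallelCriterion(
        torch.nn.CrossEntropyLoss().cuda())

    x = Variable(torch.randn(8, 10).cuda())
    y = Variable(torch.cuda.LongTensor(8).random_(2))
    outputs = net(x)              # per-GPU outputs are not gathered
    loss = criterion(outputs, y)  # loss computed on each GPU, then reduced
    loss.backward()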
 .. role:: hidden
     :class: hidden-section
-My PyTorch Utils
-================
+encoding.utils
+==============
 Useful util functions.
 .. automodule:: encoding.utils
 .. currentmodule:: encoding.utils
+:hidden:`LR_Scheduler`
+~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: LR_Scheduler
+    :members:
 :hidden:`get_optimizer`
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -24,3 +18,13 @@ Useful util functions.
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autofunction:: save_checkpoint
+:hidden:`batch_pix_accuracy`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batch_pix_accuracy
+:hidden:`batch_intersection_union`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batch_intersection_union
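A hedged sketch of the two new metric helpers, with the tensor shapes from their docstrings later in this diff (4D score tensor, 3D label tensor)::

    import torch
    from encoding.utils import batch_pix_accuracy, batch_intersection_union

    nclass = 21
    predict = torch.randn(2, nclass, 32, 32)        # raw per-class scores
    target = torch.LongTensor(2, 32, 32).random_(nclass)

    correct, labeled = batch_pix_accuracy(predict, target)
    inter, union = batch_intersection_union(predict, target, nclass)
    print('pixAcc', float(correct) / labeled)
    print('mIoU', (inter / (union + 1e-10)).mean())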
"""Dilated ResNet""" """Dilated ResNet"""
import math import math
import torch.utils.model_zoo as model_zoo import torch.utils.model_zoo as model_zoo
from .. import nn #from .. import nn
import torch.nn as nn
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'BasicBlock', 'Bottleneck'] 'resnet152', 'BasicBlock', 'Bottleneck']
...@@ -25,15 +26,16 @@ class BasicBlock(nn.Module): ...@@ -25,15 +26,16 @@ class BasicBlock(nn.Module):
"""ResNet BasicBlock """ResNet BasicBlock
""" """
expansion = 1 expansion = 1
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1): def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__() super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False) padding=dilation, dilation=dilation, bias=False)
self.bn1 = nn.BatchNorm2d(planes) self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=False)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
padding=first_dilation, dilation=first_dilation, bias=False) padding=first_dilation, dilation=first_dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes) self.bn2 = norm_layer(planes)
self.downsample = downsample self.downsample = downsample
self.stride = stride self.stride = stride
...@@ -62,18 +64,18 @@ class Bottleneck(nn.Module): ...@@ -62,18 +64,18 @@ class Bottleneck(nn.Module):
# pylint: disable=unused-argument # pylint: disable=unused-argument
expansion = 4 expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1, def __init__(self, inplanes, planes, stride=1, dilation=1,
downsample=None, first_dilation=1): downsample=None, first_dilation=1, norm_layer=None):
super(Bottleneck, self).__init__() super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes) self.bn1 = norm_layer(planes)
self.conv2 = nn.Conv2d( self.conv2 = nn.Conv2d(
planes, planes, kernel_size=3, stride=stride, planes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False) padding=dilation, dilation=dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes) self.bn2 = norm_layer(planes)
self.conv3 = nn.Conv2d( self.conv3 = nn.Conv2d(
planes, planes * 4, kernel_size=1, bias=False) planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4) self.bn3 = norm_layer(planes * 4)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=False)
self.downsample = downsample self.downsample = downsample
self.dilation = dilation self.dilation = dilation
self.stride = stride self.stride = stride
...@@ -118,18 +120,18 @@ class ResNet(nn.Module): ...@@ -118,18 +120,18 @@ class ResNet(nn.Module):
- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions." - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
""" """
# pylint: disable=unused-variable # pylint: disable=unused-variable
def __init__(self, block, layers, num_classes=1000): def __init__(self, block, layers, num_classes=1000, norm_layer=None):
self.inplanes = 64 self.inplanes = 64
super(ResNet, self).__init__() super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False) bias=False)
self.bn1 = nn.BatchNorm2d(64) self.bn1 = norm_layer(64)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=False)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0]) self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
self.avgpool = nn.AvgPool2d(7) self.avgpool = nn.AvgPool2d(7)
self.fc = nn.Linear(512 * block.expansion, num_classes) self.fc = nn.Linear(512 * block.expansion, num_classes)
...@@ -137,32 +139,33 @@ class ResNet(nn.Module): ...@@ -137,32 +139,33 @@ class ResNet(nn.Module):
if isinstance(m, nn.Conv2d): if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n)) m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d): elif isinstance(m, norm_layer):
m.weight.data.fill_(1) m.weight.data.fill_(1)
m.bias.data.zero_() m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1): def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None):
downsample = None downsample = None
if stride != 1 or self.inplanes != planes * block.expansion: if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential( downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion, nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False), kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion), norm_layer(planes * block.expansion),
) )
layers = [] layers = []
if dilation == 1 or dilation == 2: if dilation == 1 or dilation == 2:
layers.append(block(self.inplanes, planes, stride, dilation=1, layers.append(block(self.inplanes, planes, stride, dilation=1,
downsample=downsample, first_dilation=dilation)) downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
elif dilation == 4: elif dilation == 4:
layers.append(block(self.inplanes, planes, stride, dilation=2, layers.append(block(self.inplanes, planes, stride, dilation=2,
downsample=downsample, first_dilation=dilation)) downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
else: else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation)) raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion self.inplanes = planes * block.expansion
for i in range(1, blocks): for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation)) layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers) return nn.Sequential(*layers)
......
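The substantive change in this file is normalization injection: every hard-wired ``nn.BatchNorm2d`` becomes a ``norm_layer`` callable, so the same dilated backbone can be built with plain or synchronized batch norm. A hedged construction sketch (module path assumed from the docs page above)::

    import torch.nn as nn
    from encoding.dilated.resnet import ResNet, Bottleneck  # assumed path

    # Plain single-GPU batch norm; for multi-GPU training pass the
    # synchronized variant from encoding.nn instead.
    model = ResNet(Bottleneck, [3, 4, 6, 3], norm_layer=nn.BatchNorm2d)

Note that the ``isinstance(m, norm_layer)`` check in the weight-init loop means ``norm_layer`` must be a class, not a factory function.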
@@ -8,55 +8,54 @@
 ## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-"""Synchronized Batch Normalization functions"""
+"""Synchronized Cross-GPU Batch Normalization functions"""
 import torch
-from torch.autograd import Function, Variable
+from torch.autograd import Variable, Function
 from .._ext import encoding_lib
 __all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
+def sum_square(input):
+    r"""Calculate sum of elements and sum of squares for Batch Normalization"""
+    return _sum_square.apply(input)
 class _sum_square(Function):
     @staticmethod
     def forward(ctx, input):
         ctx.save_for_backward(input)
-        B, C, _, _ = input.size()
+        C = input.size(1)
         with torch.cuda.device_of(input):
             xsum = input.new().resize_(C).zero_()
             xsquare = input.new().resize_(C).zero_()
         if isinstance(input, torch.cuda.FloatTensor):
             with torch.cuda.device_of(input):
                 encoding_lib.Encoding_Float_sum_square_Forward(
-                    input.view(B, C, -1), xsum, xsquare)
+                    input, xsum, xsquare)
         elif isinstance(input, torch.cuda.DoubleTensor):
             with torch.cuda.device_of(input):
                 encoding_lib.Encoding_Double_sum_square_Forward(
-                    input.view(B, C, -1), xsum, xsquare)
+                    input, xsum, xsquare)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise RuntimeError('Unimplemented data type!', type(input))
         return xsum, xsquare
     @staticmethod
     def backward(ctx, gradSum, gradSquare):
         input, = ctx.saved_variables
-        B, C, H, W = input.data.size()
         with torch.cuda.device_of(input.data):
-            gradInput = Variable(input.data.new().resize_(B, C, H*W).zero_())
+            gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
         if isinstance(input.data, torch.cuda.FloatTensor):
             with torch.cuda.device_of(input.data):
                 encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
+                    gradInput.data, input.data, gradSum.data, gradSquare.data)
         elif isinstance(input.data, torch.cuda.DoubleTensor):
             with torch.cuda.device_of(input.data):
                 encoding_lib.Encoding_Double_sum_square_Backward(
-                    gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
+                    gradInput.data, input.data, gradSum.data, gradSquare.data)
         else:
             raise RuntimeError('Unimplemented data type!')
-        return gradInput.view(B, C, H, W)
+        return gradInput
-def sum_square(input):
-    r"""Calculate sum of elements and sum of squares for Batch Normalization"""
-    return _sum_square.apply(input)
 class _batchnorm(Function):
@@ -134,3 +133,4 @@ def batchnormeval(input, gamma, beta, mean, std):
     Please see encoding.batchnormtrain_
     """
     return _batchnorm(False)(input, gamma, beta, mean, std)
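The gradient contract the CUDA backward kernel must satisfy is simple: d sum(x)/dx = 1 and d sum(x^2)/dx = 2x per channel. A CPU reference sketch of ``sum_square`` (matching the new 3D ``(B, C, N)`` layout), handy as a mental model or for checking the kernel::

    import torch
    from torch.autograd import Variable

    def sum_square_reference(x):
        # x: (B, C, N) -> per-channel sum(x), sum(x^2)
        return x.sum(0).sum(1), (x * x).sum(0).sum(1)

    x = Variable(torch.randn(2, 3, 4), requires_grad=True)
    xsum, xsquare = sum_square_reference(x)
    (xsum.sum() + xsquare.sum()).backward()
    # grad of sum(x) is 1 and grad of sum(x^2) is 2x:
    print((x.grad.data - (1 + 2 * x.data)).abs().max())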
@@ -17,6 +17,8 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
     return THCDeviceTensor<real, Dim>();
   }
   int inDim = THCTensor_(nDimension)(state, t);
+  return toDeviceTensor<real, Dim>(state, t);
+  /*
   if (inDim == Dim) {
     return toDeviceTensor<real, Dim>(state, t);
   }
@@ -33,6 +35,7 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
     }
   }
   return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
+  */
 }
 struct Encoding_(Float2)
...
@@ -8,7 +8,7 @@
 * LICENSE file in the root directory of this source tree
 *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 */
-#include <THC/THC.h>
+#include <THC.h>
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
...
 #!/usr/bin/env bash
 mkdir -p encoding/lib && cd encoding/lib
 # compile and install
 cmake ..
...
@@ -10,12 +10,14 @@
 """Encoding Customized NN Module"""
 import torch
-from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d
+from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d, \
+    NLLLoss, BCELoss, CrossEntropyLoss
 from torch.nn import functional as F
 from .syncbn import BatchNorm2d
-__all__ = ['GramMatrix', 'View', 'Sum', 'Mean', 'Normalize', 'PyramidPooling']
+__all__ = ['GramMatrix', 'SegmentationLosses', 'View', 'Sum', 'Mean',
+           'Normalize', 'PyramidPooling']
 class GramMatrix(Module):
@@ -31,6 +33,46 @@ class GramMatrix(Module):
         gram = features.bmm(features_t) / (ch * h * w)
         return gram
+def softmax_crossentropy(input, target, weight, size_average, ignore_index, reduce=True):
+    return F.nll_loss(F.log_softmax(input, 1), target, weight,
+                      size_average, ignore_index, reduce)
+class SegmentationLosses(CrossEntropyLoss):
+    """2D Cross Entropy Loss with Auxiliary Loss"""
+    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
+        super(SegmentationLosses, self).__init__(weight, size_average, ignore_index)
+        self.aux = aux
+        self.aux_weight = aux_weight
+    def forward(self, *inputs):
+        if not self.aux:
+            return super(SegmentationLosses, self).forward(*inputs)
+        pred1, pred2, target = tuple(inputs)
+        loss1 = super(SegmentationLosses, self).forward(pred1, target)
+        loss2 = super(SegmentationLosses, self).forward(pred2, target)
+        return loss1 + self.aux_weight * loss2
+"""
+class SegmentationLosses(Module):
+    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
+        super(SegmentationLosses, self).__init__()
+        self.aux = aux
+        self.aux_weight = aux_weight
+        # Somehow the size average is not handled correctly on multi-gpu, so we average by ourselves.
+        self.nll_loss = NLLLoss(weight, ignore_index=ignore_index, reduce=True)
+    def _forward_each(self, inputs, targets):
+        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
+    def forward(self, *inputs):
+        if not self.aux:
+            return self._forward_each(*inputs)
+        pred1, pred2, target = tuple(inputs)
+        loss1 = self._forward_each(pred1, target)
+        loss2 = self._forward_each(pred2, target)
+        return loss1 + self.aux_weight * loss2
+"""
 class View(Module):
     """Reshape the input into different size, an inplace operator, support
...
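``SegmentationLosses`` degenerates to a plain ``CrossEntropyLoss`` when ``aux=False``; with ``aux=True`` it expects two predictions and sums the two losses with weight ``aux_weight``. A hedged usage sketch (assuming the class is re-exported from ``encoding.nn``)::

    import torch
    from torch.autograd import Variable
    from encoding.nn import SegmentationLosses  # assumed re-export

    criterion = SegmentationLosses(aux=True, aux_weight=0.2, ignore_index=-1)
    B, nclass, H, W = 2, 21, 16, 16
    pred1 = Variable(torch.randn(B, nclass, H, W), requires_grad=True)
    pred2 = Variable(torch.randn(B, nclass, H, W), requires_grad=True)  # aux head
    target = Variable(torch.LongTensor(B, H, W).random_(nclass))

    loss = criterion(pred1, pred2, target)  # = loss(pred1) + 0.2 * loss(pred2)
    loss.backward()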
@@ -9,48 +9,63 @@
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 """Synchronized Cross-GPU Batch Normalization Module"""
+import functools
+import collections
 import threading
 import torch
 from torch.nn import Module, Sequential, Conv1d, Conv2d, ConvTranspose2d, \
-    ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear
+    ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear, \
+    DataParallel
 from torch.nn.modules.batchnorm import _BatchNorm
+from torch.nn.functional import batch_norm
+from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
-from ..functions import batchnormtrain, batchnormeval, sum_square
+from ..functions import *
 from ..parallel import allreduce
 __all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
            'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
            'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']
 class _SyncBatchNorm(_BatchNorm):
-    # pylint: disable=access-member-before-definition
-    def __init__(self, num_features, eps=1e-5, momentum=0.1, **kwargs):
-        super(_SyncBatchNorm, self).__init__(num_features, eps=1e-5, momentum=0.1, **kwargs)
-        # syncBN
-        self.writelock = threading.Lock()
-        nGPUs = torch.cuda.device_count()
-        self.sharedT = SharedTensor(nGPUs)
+    def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
+        super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)
+        self._is_parallel = False
+        self._parallel_id = None
+        self._slave_pipe = None
+        self.sharedT = SharedTensor(torch.cuda.device_count())
     def forward(self, input):
-        self._check_input_dim(input)
+        # Resize the input to (B, C, -1).
         input_shape = input.size()
         input = input.view(input_shape[0], self.num_features, -1)
         if not self.training:
             std = (self.running_var.clamp(self.eps)).sqrt()
             output = batchnormeval(input, self.weight, self.bias, self.running_mean, std)
             return output.view(input_shape)
-        # get global sum(x) and sum(x^2)
-        xsum, xsquare = self.sharedT(sum_square(input.unsqueeze(3)))
+        # sum(x) and sum(x^2)
+        N = input.size(0) * input.size(2)
+        xsum, xsqsum = sum_square(input)
+        # all-reduce for global sum(x) and sum(x^2)
+        igpu = input.get_device()
+        self.sharedT.push(N, igpu, xsum, xsqsum)
+        N, xsum, xsqsum = self.sharedT.pull(igpu)
         # calculate mean, var
-        N = len(self.sharedT) * input.size(0) * input.size(2)
         mean = xsum / N
-        sumvar = xsquare - xsum * xsum / N
+        sumvar = xsqsum - xsum * xsum / N
         unbias_var = sumvar / (N - 1)
         bias_var = sumvar / N
         std = bias_var.clamp(self.eps).sqrt()
         # update running_mean and var
         self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * mean.data
         self.running_var = (1-self.momentum) * self.running_var + self.momentum * unbias_var.data
         # forward
         return batchnormtrain(input, self.weight, self.bias, mean, std).view(input_shape)
@@ -61,6 +76,8 @@ class BatchNorm1d(_SyncBatchNorm):
         if input.dim() != 2 and input.dim() != 3:
             raise ValueError('expected 2D or 3D input (got {}D input)'
                              .format(input.dim()))
+        super(BatchNorm1d, self)._check_input_dim(input)
 class BatchNorm2d(_SyncBatchNorm):
     r"""Cross-GPU Synchronized Batch normalization (SyncBN)
@@ -70,6 +87,9 @@ class BatchNorm2d(_SyncBatchNorm):
     We follow the sync-once implementation described in the paper [2]_ .
     Please see the design idea in the `notes <./notes/syncbn.html>`_.
+    .. note::
+        Please use ``CUDA_VISIBLE_DEVICES`` to select the number of GPUs.
     .. math::
         y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta
@@ -106,13 +126,16 @@ class BatchNorm2d(_SyncBatchNorm):
     Examples:
         >>> # Use it exactly like the standard BatchNorm2d
-        >>> m = nn.BatchNorm2d(100)
-        >>> output = m(input)
+        >>> m = BatchNorm2d(100)
+        >>> net = torch.nn.DataParallel(m)
+        >>> output = net(input)
     """
     def _check_input_dim(self, input):
         if input.dim() != 4:
             raise ValueError('expected 4D input (got {}D input)'
                              .format(input.dim()))
+        super(BatchNorm2d, self)._check_input_dim(input)
 class BatchNorm3d(_SyncBatchNorm):
     r"""Please see the docs in :class:`encoding.nn.BatchNorm2d`"""
@@ -120,10 +143,11 @@ class BatchNorm3d(_SyncBatchNorm):
         if input.dim() != 5:
             raise ValueError('expected 5D input (got {}D input)'
                              .format(input.dim()))
+        super(BatchNorm3d, self)._check_input_dim(input)
 class SharedTensor(object):
-    """Shared Tensor for cross GPU communication
-    """
+    """Shared Tensor for cross GPU all reduce operation"""
     def __init__(self, nGPUs):
         self.mutex = threading.Lock()
         self.all_tasks_done = threading.Condition(self.mutex)
@@ -131,28 +155,37 @@ class SharedTensor(object):
         self._clear()
     def _clear(self):
-        self.list = []
+        self.N = 0
+        self.dict = {}
         self.push_tasks = self.nGPUs
         self.reduce_tasks = self.nGPUs
-    def __call__(self, *inputs):
-        if self.nGPUs <= 1:
-            return tuple(inputs)
+    def push(self, *inputs):
         # push from device
         with self.mutex:
            if self.push_tasks == 0:
                self._clear()
-           self.list.extend(list(*inputs))
-           idx = self.nGPUs - self.push_tasks
+           self.N += inputs[0]
+           igpu = inputs[1]
+           self.dict[igpu] = inputs[2:]
+           #idx = self.nGPUs - self.push_tasks
           self.push_tasks -= 1
        with self.all_tasks_done:
            if self.push_tasks == 0:
                self.all_tasks_done.notify_all()
            while self.push_tasks:
                self.all_tasks_done.wait()
+    def pull(self, igpu):
        # pull from device
        with self.mutex:
-           if self.reduce_tasks == self.nGPUs:
-               assert(len(self.list) == 2 * self.nGPUs)
-               self.list = allreduce(2, *self.list)
+           if igpu == 0:
+               assert(len(self.dict) == self.nGPUs)
+               # flatten the tensors
+               self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
+               self.outlist = allreduce(2, *self.list)
                self.reduce_tasks -= 1
            else:
                self.reduce_tasks -= 1
@@ -162,10 +195,11 @@ class SharedTensor(object):
            while self.reduce_tasks:
                self.all_tasks_done.wait()
        # all reduce done
-        return self.list[2*idx], self.list[2*idx+1]
+        return self.N, self.outlist[2*igpu], self.outlist[2*igpu+1]
     def __len__(self):
         return self.nGPUs
     def __repr__(self):
         return ('SharedTensor')
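The reworked ``SharedTensor`` is a two-phase barrier: each GPU thread ``push()``es ``(N, igpu, xsum, xsqsum)``, and on ``pull()`` the thread for GPU 0 flattens the dict in device order and runs a single ``allreduce`` whose result every thread then reads. A CPU-only analogue of the protocol (hypothetical two-worker setup; the real class reduces CUDA tensors and uses lock-guarded condition variables)::

    import threading

    class SimpleShared(object):
        # Hedged analogue of SharedTensor: push stats, barrier, reduce once, pull.
        def __init__(self, nworkers):
            self.push_barrier = threading.Barrier(nworkers)
            self.pull_barrier = threading.Barrier(nworkers)
            self.N, self.stats = 0, {}

        def push(self, N, i, xsum, xsqsum):
            self.N += N                   # the original guards this with a lock
            self.stats[i] = (xsum, xsqsum)
            self.push_barrier.wait()      # original: condition on push_tasks

        def pull(self, i):
            if i == 0:                    # worker 0 reduces for everyone
                self.total = (sum(v[0] for v in self.stats.values()),
                              sum(v[1] for v in self.stats.values()))
            self.pull_barrier.wait()
            return (self.N,) + self.total

    shared = SimpleShared(2)
    def worker(i):
        shared.push(10, i, float(i + 1), float(i + 2))
        print(i, shared.pull(i))

    threads = [threading.Thread(target=worker, args=(j,)) for j in range(2)]
    for t in threads: t.start()
    for t in threads: t.join()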
@@ -11,31 +11,50 @@
 """Encoding Data Parallel"""
 import threading
 import torch
-from torch.autograd import Function
+from torch.autograd import Variable, Function
 import torch.cuda.comm as comm
 from torch.nn.parallel.data_parallel import DataParallel
-from torch.nn.parallel.replicate import replicate
 from torch.nn.parallel.parallel_apply import get_a_var
 from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
 __all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion']
+torch_ver = torch.__version__[:3]
-def allreduce(num_inputs, *inputs):
+def allreduce(*inputs):
     """Cross GPU all reduce autograd operation for calculate mean and
     variance in SyncBN.
     """
-    target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
-    result = ReduceAddCoalesced.apply(target_gpus[0], num_inputs, *inputs)
-    outputs = Broadcast.apply(target_gpus, *result)
-    assert len(outputs) == len(inputs)
-    return outputs
+    return AllReduce.apply(*inputs)
+class AllReduce(Function):
+    @staticmethod
+    def forward(ctx, num_inputs, *inputs):
+        ctx.num_inputs = num_inputs
+        ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
+        inputs = [inputs[i:i + num_inputs]
+                  for i in range(0, len(inputs), num_inputs)]
+        # sort before reduce sum
+        inputs = sorted(inputs, key=lambda i: i[0].get_device())
+        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
+        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
+        return tuple([t for tensors in outputs for t in tensors])
+    @staticmethod
+    def backward(ctx, *inputs):
+        inputs = [i.data for i in inputs]
+        inputs = [inputs[i:i + ctx.num_inputs]
+                  for i in range(0, len(inputs), ctx.num_inputs)]
+        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
+        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
+        return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
 class Reduce(Function):
     @staticmethod
     def forward(ctx, *inputs):
         ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
+        inputs = sorted(inputs, key=lambda i: i.get_device())
         return comm.reduce_add(inputs)
     @staticmethod
@@ -101,12 +120,13 @@ class DataParallelCriterion(DataParallel):
         # scattering the targets instead
         if not self.device_ids:
             return self.module(inputs, *targets, **kwargs)
-        targets, kwargs = inputs(targets, kwargs, self.device_ids)
+        targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
         if len(self.device_ids) == 1:
             return self.module(inputs, *targets[0], **kwargs[0])
-        replicas = replicate(self.module, self.device_ids[:len(inputs)])
+        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
         outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
         return Reduce.apply(*outputs) / len(outputs)
+        #return self.gather(outputs, self.output_device).mean()
 def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
@@ -123,14 +143,16 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
     lock = threading.Lock()
     results = {}
-    grad_enabled = torch.is_grad_enabled()
+    if torch_ver != "0.3":
+        grad_enabled = torch.is_grad_enabled()
     def _worker(i, module, input, target, kwargs, device=None):
-        torch.set_grad_enabled(grad_enabled)
+        if torch_ver != "0.3":
+            torch.set_grad_enabled(grad_enabled)
         if device is None:
             device = get_a_var(input).get_device()
         try:
-            with torch.cuda.device_of(var_input):
+            with torch.cuda.device(device):
                 output = module(*(input + target), **kwargs)
             with lock:
                 results[i] = output
@@ -142,7 +164,7 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
     threads = [threading.Thread(target=_worker,
                                 args=(i, module, input, target,
                                       kwargs, device),)
-               for i, (module, input, target, kwargs) in
+               for i, (module, input, target, kwargs, device) in
                enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
     for thread in threads:
...
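The new ``AllReduce`` is symmetric: its backward performs the same reduce-add plus broadcast as the forward (with an extra ``None`` for the ``num_inputs`` slot). The calling convention, as exercised by the test later in this diff, is ``num_inputs`` followed by the flattened per-GPU tensors::

    import torch
    from torch.autograd import Variable
    import encoding

    ngpu = torch.cuda.device_count()
    xs = [Variable(torch.ones(3).cuda(i) * (i + 1), requires_grad=True)
          for i in range(ngpu)]
    ys = encoding.parallel.allreduce(1, *xs)  # one tensor per GPU
    # every output now holds 1 + 2 + ... + ngpu, one copy per device
    print([y.data.cpu() for y in ys])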
@@ -9,36 +9,18 @@
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 """Encoding Util Tools"""
-import shutil
 import os
+import errno
+import requests
+import shutil
+import hashlib
 import math
+from tqdm import tqdm
+import numpy as np
+from PIL import Image
 import torch
-__all__ = ['get_optimizer', 'LR_Scheduler', 'save_checkpoint']
+__all__ = ['LR_Scheduler', 'save_checkpoint', 'batch_pix_accuracy',
+           'batch_intersection_union', 'download', 'mkdir', 'check_sha1']
-def get_optimizer(args, model, diff_LR=True):
-    """
-    Returns an optimizer for given model,
-    Args:
-        args: :attr:`args.lr`, :attr:`args.momentum`, :attr:`args.weight_decay`
-        model: if using different lr, define `model.pretrained` and `model.head`.
-    """
-    if diff_LR and model.pretrained is not None:
-        print('Using different learning rate for pre-trained features')
-        optimizer = torch.optim.SGD([
-            {'params': model.pretrained.parameters()},
-            {'params': model.head.parameters(),
-             'lr': args.lr*10},
-        ],
-            lr=args.lr,
-            momentum=args.momentum,
-            weight_decay=args.weight_decay)
-    else:
-        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
-                                    momentum=args.momentum,
-                                    weight_decay=args.weight_decay)
-    return optimizer
 class LR_Scheduler(object):
@@ -105,3 +87,245 @@ def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
     torch.save(state, filename)
     if is_best:
         shutil.copyfile(filename, directory + 'model_best.pth.tar')
+def batch_pix_accuracy(predict, target):
+    """Batch Pixel Accuracy
+    Args:
+        predict: input 4D tensor
+        target: label 3D tensor
+    """
+    _, predict = torch.max(predict, 1)
+    # pixel_labeled = (target >= 0).sum().item()
+    # TODO currently torch.eq is not working as expected, change back when it's fixed
+    # pixel_correct = torch.eq(predict, target).sum().item()
+    predict = predict.cpu().numpy()
+    target = target.cpu().numpy()
+    pixel_labeled = np.sum(target >= 0)
+    pixel_correct = np.sum((predict == target)*(target >= 0))
+    assert(pixel_correct <= pixel_labeled)
+    return pixel_correct, pixel_labeled
+def batch_intersection_union(predict, target, nclass):
+    """Batch Intersection over Union
+    Args:
+        predict: input 4D tensor
+        target: label 3D tensor
+        nclass: number of categories (int)
+    """
+    _, predict = torch.max(predict, 1)
+    mini = 0
+    maxi = nclass - 1
+    nbins = nclass
+    """
+    predict = predict.cpu().numpy()
+    target = target.cpu().numpy()
+    predict = predict * (target >= 0).astype(predict.dtype)
+    intersection = predict * (predict == target)
+    # areas of intersection and union
+    area_inter, _ = np.histogram(intersection, bins=nbins,
+                                 range=(mini, maxi))
+    area_pred, _ = np.histogram(predict, bins=nbins,
+                                range=(mini, maxi))
+    area_lab, _ = np.histogram(target, bins=nbins,
+                               range=(mini, maxi))
+    area_union = area_pred + area_lab - area_inter
+    # Somehow a PyTorch update breaks this; will change back if fixed
+    """
+    predict = predict * (target >= 0).type_as(predict)
+    intersection = predict * (predict == target).type_as(predict)
+    area_inter = torch.histc(intersection.cpu().float(), bins=nclass,
+                             min=mini, max=maxi)
+    area_pred = torch.histc(predict.cpu().float(), bins=nclass, min=mini,
+                            max=maxi)
+    area_lab = torch.histc(target.cpu().float(), bins=nclass, min=mini,
+                           max=maxi)
+    area_union = area_pred + area_lab - area_inter
+    return area_inter, area_union
+def get_selabel_vector(target, nclass):
+    """Get SE-Loss Label in a batch
+    Args:
+        predict: input 4D tensor
+        target: label 3D tensor (BxHxW)
+        nclass: number of categories (int)
+    Output:
+        2D tensor (BxnClass)
+    """
+    batch = target.size(0)
+    tvect = torch.zeros(batch, nclass)
+    for i in range(batch):
+        hist = torch.histc(target[i].data.float(),
+                           bins=nclass, min=0,
+                           max=nclass-1)
+        vect = hist>0
+        tvect[i] = vect
+    return tvect
+def get_mask_pallete(npimg, dataset='detail'):
+    """Get image color palette for visualizing masks"""
+    # recovery boundary
+    if dataset == 'pascal_voc':
+        npimg[npimg==21] = 255
+    # put colormap
+    out_img = Image.fromarray(npimg.astype('uint8'))
+    if dataset == 'ade20k':
+        out_img.putpalette(adepallete)
+    elif dataset == 'cityscapes':
+        out_img.putpalette(citypallete)
+    else:
+        out_img.putpalette(vocpallete)
+    return out_img
+def download(url, path=None, overwrite=False, sha1_hash=None):
+    """Download a given URL
+    Parameters
+    ----------
+    url : str
+        URL to download
+    path : str, optional
+        Destination path to store downloaded file. By default stores to the
+        current directory with same name as in url.
+    overwrite : bool, optional
+        Whether to overwrite destination file if already exists.
+    sha1_hash : str, optional
+        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
+        but doesn't match.
+    Returns
+    -------
+    str
+        The file path of the downloaded file.
+    """
+    if path is None:
+        fname = url.split('/')[-1]
+    else:
+        path = os.path.expanduser(path)
+        if os.path.isdir(path):
+            fname = os.path.join(path, url.split('/')[-1])
+        else:
+            fname = path
+    if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
+        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+        print('Downloading %s from %s...'%(fname, url))
+        r = requests.get(url, stream=True)
+        if r.status_code != 200:
+            raise RuntimeError("Failed downloading url %s"%url)
+        total_length = r.headers.get('content-length')
+        with open(fname, 'wb') as f:
+            if total_length is None: # no content length header
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk: # filter out keep-alive new chunks
+                        f.write(chunk)
+            else:
+                total_length = int(total_length)
+                for chunk in tqdm(r.iter_content(chunk_size=1024),
+                                  total=int(total_length / 1024. + 0.5),
+                                  unit='KB', unit_scale=False, dynamic_ncols=True):
+                    f.write(chunk)
+        if sha1_hash and not check_sha1(fname, sha1_hash):
+            raise UserWarning('File {} is downloaded but the content hash does not match. ' \
+                              'The repo may be outdated or download may be incomplete. ' \
+                              'If the "repo_url" is overridden, consider switching to ' \
+                              'the default repo.'.format(fname))
+    return fname
+def check_sha1(filename, sha1_hash):
+    """Check whether the sha1 hash of the file content matches the expected hash.
+    Parameters
+    ----------
+    filename : str
+        Path to the file.
+    sha1_hash : str
+        Expected sha1 hash in hexadecimal digits.
+    Returns
+    -------
+    bool
+        Whether the file content matches the expected hash.
+    """
+    sha1 = hashlib.sha1()
+    with open(filename, 'rb') as f:
+        while True:
+            data = f.read(1048576)
+            if not data:
+                break
+            sha1.update(data)
+    return sha1.hexdigest() == sha1_hash
+def mkdir(path):
+    """Make a directory; no error if it already exists."""
+    try:
+        os.makedirs(path)
+    except OSError as exc: # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+# ref https://github.com/CSAILVision/sceneparsing/blob/master/evaluationCode/utils_eval.py
+def pixel_accuracy(im_pred, im_lab):
+    im_pred = np.asarray(im_pred)
+    im_lab = np.asarray(im_lab)
+    # Remove classes from unlabeled pixels in gt image.
+    # We should not penalize detections in unlabeled portions of the image.
+    pixel_labeled = np.sum(im_lab > 0)
+    pixel_correct = np.sum((im_pred == im_lab) * (im_lab > 0))
+    #pixel_accuracy = 1.0 * pixel_correct / pixel_labeled
+    return pixel_correct, pixel_labeled
+def intersection_and_union(im_pred, im_lab, num_class):
+    im_pred = np.asarray(im_pred)
+    im_lab = np.asarray(im_lab)
+    # Remove classes from unlabeled pixels in gt image.
+    im_pred = im_pred * (im_lab > 0)
+    # Compute area intersection:
+    intersection = im_pred * (im_pred == im_lab)
+    area_inter, _ = np.histogram(intersection, bins=num_class-1,
+                                 range=(1, num_class - 1))
+    # Compute area union:
+    area_pred, _ = np.histogram(im_pred, bins=num_class-1,
+                                range=(1, num_class - 1))
+    area_lab, _ = np.histogram(im_lab, bins=num_class-1,
+                               range=(1, num_class - 1))
+    area_union = area_pred + area_lab - area_inter
+    return area_inter, area_union
+def _get_voc_pallete(num_cls):
+    n = num_cls
+    pallete = [0]*(n*3)
+    for j in range(0, n):
+        lab = j
+        pallete[j*3+0] = 0
+        pallete[j*3+1] = 0
+        pallete[j*3+2] = 0
+        i = 0
+        while (lab > 0):
+            pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i))
+            pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i))
+            pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i))
+            i = i + 1
+            lab >>= 3
+    return pallete
+vocpallete = _get_voc_pallete(256)
adepallete = [0,0,0,120,120,120,180,120,120,6,230,230,80,50,50,4,200,3,120,120,80,140,140,140,204,5,255,230,230,230,4,250,7,224,5,255,235,255,7,150,5,61,120,120,70,8,255,51,255,6,82,143,255,140,204,255,4,255,51,7,204,70,3,0,102,200,61,230,250,255,6,51,11,102,255,255,7,71,255,9,224,9,7,230,220,220,220,255,9,92,112,9,255,8,255,214,7,255,224,255,184,6,10,255,71,255,41,10,7,255,255,224,255,8,102,8,255,255,61,6,255,194,7,255,122,8,0,255,20,255,8,41,255,5,153,6,51,255,235,12,255,160,150,20,0,163,255,140,140,140,250,10,15,20,255,0,31,255,0,255,31,0,255,224,0,153,255,0,0,0,255,255,71,0,0,235,255,0,173,255,31,0,255,11,200,200,255,82,0,0,255,245,0,61,255,0,255,112,0,255,133,255,0,0,255,163,0,255,102,0,194,255,0,0,143,255,51,255,0,0,82,255,0,255,41,0,255,173,10,0,255,173,255,0,0,255,153,255,92,0,255,0,255,255,0,245,255,0,102,255,173,0,255,0,20,255,184,184,0,31,255,0,255,61,0,71,255,255,0,204,0,255,194,0,255,82,0,10,255,0,112,255,51,0,255,0,194,255,0,122,255,0,255,163,255,153,0,0,255,10,255,112,0,143,255,0,82,0,255,163,255,0,255,235,0,8,184,170,133,0,255,0,255,92,184,0,255,255,0,31,0,184,255,0,214,255,255,0,112,92,255,0,0,224,255,112,224,255,70,184,160,163,0,255,153,0,255,71,255,0,255,0,163,255,204,0,255,0,143,0,255,235,133,255,0,255,0,235,245,0,255,255,0,122,255,245,0,10,190,212,214,255,0,0,204,255,20,0,255,255,255,0,0,153,255,0,41,255,0,255,204,41,0,255,41,255,0,173,0,255,0,245,255,71,0,255,122,0,255,0,255,184,0,92,255,184,255,0,0,133,255,255,214,0,25,194,194,102,255,0,92,0,255]
citypallete = [
128,64,128,244,35,232,70,70,70,102,102,156,190,153,153,153,153,153,250,170,30,220,220,0,107,142,35,152,251,152,70,130,180,220,20,60,255,0,0,0,0,142,0,0,70,0,60,100,0,80,100,0,0,230,119,11,32,128,192,0,0,64,128,128,64,128,0,192,128,128,192,128,64,64,0,192,64,0,64,192,0,192,192,0,64,64,128,192,64,128,64,192,128,192,192,128,0,0,64,128,0,64,0,128,64,128,128,64,0,0,192,128,0,192,0,128,192,128,128,192,64,0,64,192,0,64,64,128,64,192,128,64,64,0,192,192,0,192,64,128,192,192,128,192,0,64,64,128,64,64,0,192,64,128,192,64,0,64,192,128,64,192,0,192,192,128,192,192,64,64,64,192,64,64,64,192,64,192,192,64,64,64,192,192,64,192,64,192,192,192,192,192,32,0,0,160,0,0,32,128,0,160,128,0,32,0,128,160,0,128,32,128,128,160,128,128,96,0,0,224,0,0,96,128,0,224,128,0,96,0,128,224,0,128,96,128,128,224,128,128,32,64,0,160,64,0,32,192,0,160,192,0,32,64,128,160,64,128,32,192,128,160,192,128,96,64,0,224,64,0,96,192,0,224,192,0,96,64,128,224,64,128,96,192,128,224,192,128,32,0,64,160,0,64,32,128,64,160,128,64,32,0,192,160,0,192,32,128,192,160,128,192,96,0,64,224,0,64,96,128,64,224,128,64,96,0,192,224,0,192,96,128,192,224,128,192,32,64,64,160,64,64,32,192,64,160,192,64,32,64,192,160,64,192,32,192,192,160,192,192,96,64,64,224,64,64,96,192,64,224,192,64,96,64,192,224,64,192,96,192,192,224,192,192,0,32,0,128,32,0,0,160,0,128,160,0,0,32,128,128,32,128,0,160,128,128,160,128,64,32,0,192,32,0,64,160,0,192,160,0,64,32,128,192,32,128,64,160,128,192,160,128,0,96,0,128,96,0,0,224,0,128,224,0,0,96,128,128,96,128,0,224,128,128,224,128,64,96,0,192,96,0,64,224,0,192,224,0,64,96,128,192,96,128,64,224,128,192,224,128,0,32,64,128,32,64,0,160,64,128,160,64,0,32,192,128,32,192,0,160,192,128,160,192,64,32,64,192,32,64,64,160,64,192,160,64,64,32,192,192,32,192,64,160,192,192,160,192,0,96,64,128,96,64,0,224,64,128,224,64,0,96,192,128,96,192,0,224,192,128,224,192,64,96,64,192,96,64,64,224,64,192,224,64,64,96,192,192,96,192,64,224,192,192,224,192,32,32,0,160,32,0,32,160,0,160,160,0,32,32,128,160,32,128,32,160,128,160,160,128,96,32,0,224,32,0,96,160,0,224,160,0,96,32,128,224,32,128,96,160,128,224,160,128,32,96,0,160,96,0,32,224,0,160,224,0,32,96,128,160,96,128,32,224,128,160,224,128,96,96,0,224,96,0,96,224,0,224,224,0,96,96,128,224,96,128,96,224,128,224,224,128,32,32,64,160,32,64,32,160,64,160,160,64,32,32,192,160,32,192,32,160,192,160,160,192,96,32,64,224,32,64,96,160,64,224,160,64,96,32,192,224,32,192,96,160,192,224,160,192,32,96,64,160,96,64,32,224,64,160,224,64,32,96,192,160,96,192,32,224,192,160,224,192,96,96,64,224,96,64,96,224,64,224,224,64,96,96,192,224,96,192,96,224,192,0,0,0]
@@ -53,7 +53,9 @@ def main():
     print(model)
     # criterion and optimizer
     criterion = nn.CrossEntropyLoss()
-    optimizer = get_optimizer(args, model, False)
+    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
     if args.cuda:
         model.cuda()
         # Please use CUDA_VISIBLE_DEVICES to control the number of gpus
...
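``get_optimizer`` is gone from ``encoding.utils`` (the experiment script above now builds its SGD optimizer inline), while download helpers move in. A hedged sketch of the new helpers; the URL is a placeholder, not a real release asset::

    from encoding.utils import download, mkdir, check_sha1

    mkdir('./pretrained')   # no error if the directory already exists
    fname = download('https://example.com/resnet50.pth', path='./pretrained',
                     overwrite=False, sha1_hash=None)
    # pass the expected hex digest as sha1_hash to verify, or check separately:
    # ok = check_sha1(fname, '<expected-sha1-hexdigest>')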
@@ -33,7 +33,7 @@ class install(setuptools.command.install.install):
             f.write('"""This is encoding version file."""\n')
             f.write("__version__ = '{}'\n".format(version))
-version = '0.3.0'
+version = '0.4.0'
 try:
     sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                   cwd=cwd).decode('ascii').strip()
@@ -41,22 +41,36 @@ try:
 except Exception:
     pass
+try:
+    import pypandoc
+    readme = pypandoc.convert('README.md', 'rst')
+except(IOError, ImportError):
+    readme = open('README.md').read()
+requirements = [
+    'numpy',
+    'tqdm',
+    'nose',
+    'torch>=0.3.1',
+    'cffi>=1.0.0',
+]
 setup(
     name="encoding",
     version=version,
-    description="PyTorch Encoding",
-    url="https://github.com/zhanghang1989/PyTorch-Encoding",
     author="Hang Zhang",
-    author_email="zhang.hang@rutgers.edu",
-    # Require cffi.
-    install_requires=["cffi>=1.0.0"],
-    setup_requires=["cffi>=1.0.0"],
-    # Exclude the build files.
-    packages=find_packages(exclude=["build"]),
-    # Package where to put the extensions. Has to be a prefix of build.py.
-    package_data={'encoding': [
+    author_email="zhanghang0704@gmail.com",
+    url="https://github.com/zhanghang1989/PyTorch-Encoding",
+    description="PyTorch Encoding Package",
+    long_description=readme,
+    license='MIT',
+    install_requires=requirements,
+    packages=find_packages(exclude=["tests", "experiments"]),
+    package_data={ 'encoding': [
         'lib/*.so*', 'lib/*.dylib*',
+        '_ext/encoding_lib/*.so', '_ext/encoding_lib/*.dylib',
         'kernel/*.h', 'kernel/generic/*h',
+        'src/*.h',
     ]},
     ext_package="",
     # Extensions to compile.
...
@@ -8,16 +8,21 @@
 ## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-import encoding
-import unittest
+import numpy as np
 import torch
+import torch.nn.functional as F
 from torch.autograd import Variable, gradcheck
-import torchvision.models as models
+import encoding
 EPS = 1e-6
+ATOL = 1e-4
+def _assert_tensor_close(a, b, atol=ATOL, rtol=EPS):
+    npa, npb = a.cpu().numpy(), b.cpu().numpy()
+    assert np.allclose(npa, npb, atol=atol), \
+        'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(
+            a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max())
 def test_aggregate():
     B,N,K,D = 2,3,4,5
@@ -28,7 +33,7 @@ def test_aggregate():
     C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5),
                  requires_grad=True)
     input = (A, X, C)
-    test = gradcheck(encoding.functions.aggregate, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(encoding.functions.aggregate, input, eps=EPS, atol=ATOL)
     print('Testing aggregate(): {}'.format(test))
@@ -41,7 +46,7 @@ def test_scaledL2():
     S = Variable(torch.cuda.DoubleTensor(K).uniform_(-0.5,0.5),
                  requires_grad=True)
     input = (X, C, S)
-    test = gradcheck(encoding.functions.scaledL2, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(encoding.functions.scaledL2, input, eps=EPS, atol=ATOL)
     print('Testing scaledL2(): {}'.format(test))
@@ -51,16 +56,16 @@ def test_encoding():
                  requires_grad=True)
     input = (X,)
     layer = encoding.nn.Encoding(C,K).double().cuda()
-    test = gradcheck(layer, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(layer, input, eps=EPS, atol=ATOL)
     print('Testing encoding(): {}'.format(test))
 def test_sum_square():
-    B,C,H,W = 2,3,4,5
-    X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5),
+    B,C,H = 2,3,4
+    X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5,0.5),
                  requires_grad=True)
     input = (X,)
-    test = gradcheck(encoding.functions.sum_square, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(encoding.functions.sum_square, input, eps=EPS, atol=ATOL)
     print('Testing sum_square(): {}'.format(test))
@@ -71,6 +76,97 @@ def test_all_reduce():
         x.requires_grad = True
     Y = encoding.parallel.allreduce(1, *X)
     assert (len(X) == len(Y))
+    for i in range(1, ngpu):
+        _assert_tensor_close(Y[i].data, Y[0].data)
+    input = (1, *X)
+    #test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
+    test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
+def _test_syncbn(train_mode=True):
+    # generate input
+    B,C,H,W = 8,3,4,5
+    X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5),
+                 requires_grad=True)
+    input = (X,)
+    # SyncBN using DataParallel
+    layer = encoding.nn.SyncBatchNorm2d(C)
+    model = torch.nn.DataParallel(layer).double().cuda()
+    layer.train(train_mode)
+    # grad check
+    test = gradcheck(model, input, eps=EPS, atol=ATOL)
+    print('Testing SyncBatchNorm2d(): {}'.format(test))
+def _test_syncbn_func(train_mode=True):
+    # generate input
+    B, C, H = 2, 3, 4
+    X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5, 0.5),
+                 requires_grad=True)
+    xsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
+    xsqsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
+    gamma = Variable(torch.ones(C).double().cuda(), requires_grad=True)
+    beta = Variable(torch.zeros(C).double().cuda(), requires_grad=True)
+    gamma.requires_grad=True
+    beta.requires_grad=True
+    runningVar = torch.ones(C).double().cuda()
+    runningMean = torch.zeros(C).double().cuda()
+    N = B * H
+    inputs = (X, xsum, xsqsum, gamma, beta, runningMean, runningVar, N, 0.1, 1e-5, train_mode)
+    # grad check
+    test = gradcheck(encoding.functions.batchnorm.apply, inputs, eps=EPS, atol=ATOL)
+    print('Testing batchnorm(): {}'.format(test))
+def _checkBatchNormResult(bn1, bn2, input, is_train, cuda=False):
+    def _find_bn(module):
+        for m in module.modules():
+            if isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
+                              encoding.nn.SyncBatchNorm1d, encoding.nn.SyncBatchNorm2d)):
+                return m
+    def _syncParameters(bn1, bn2):
+        bn1.reset_parameters()
+        bn2.reset_parameters()
+        if bn1.affine and bn2.affine:
+            bn2.weight.data.copy_(bn1.weight.data)
+            bn2.bias.data.copy_(bn1.bias.data)
+            bn2.running_mean.copy_(bn1.running_mean)
+            bn2.running_var.copy_(bn1.running_var)
+    bn1.train(mode=is_train)
+    bn2.train(mode=is_train)
+    if cuda:
+        input = input.cuda()
+    # using the same values for gamma and beta
+    _syncParameters(_find_bn(bn1), _find_bn(bn2))
+    input1 = Variable(input.clone(), requires_grad=True)
+    output1 = bn1(input1)
+    input2 = Variable(input.clone(), requires_grad=True)
+    output2 = bn2(input2)
+    _assert_tensor_close(input1.data, input2.data)
+    _assert_tensor_close(output1.data, output2.data)
+    if not is_train:
+        return
+    (output1 ** 2).sum().backward()
+    (output2 ** 2).sum().backward()
+    _assert_tensor_close(input1.grad.data, input2.grad.data)
+    _assert_tensor_close(_find_bn(bn1).running_mean, _find_bn(bn2).running_mean)
+    _assert_tensor_close(_find_bn(bn1).running_var, _find_bn(bn2).running_var)
+def testSyncBN():
+    bn = torch.nn.BatchNorm2d(10).cuda().double()
+    sync_bn = encoding.nn.SyncBatchNorm2d(10).double()
+    sync_bn = torch.nn.DataParallel(sync_bn).cuda()
+    # check with unsync version
+    for i in range(10):
+        _checkBatchNormResult(bn, sync_bn, torch.rand(16, 10, 16, 16).double(), True, cuda=True)
+        _checkBatchNormResult(bn, sync_bn, torch.rand(16, 10, 16, 16).double(), False, cuda=True)
+    # gradcheck
+    _test_syncbn_func(True)
+    _test_syncbn(True)
 if __name__ == '__main__':
...