Unverified commit ce461dae, authored by Hang Zhang, committed by GitHub

V1.0.0 (#156)

* v1.0
parent c2cb2aab
@@ -57,19 +57,6 @@ Train Your Own Model
    --eval                evaluating
Extending the Software
----------------------
This codebase is designed to be easy to extend with your own models and datasets:

- Add your own dataloader ``mydataset.py`` to the ``dataset/`` folder
- Add your own model ``mymodel.py`` to the ``model/`` folder
- Run the program::

    python main.py --dataset mydataset --model mymodel
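For orientation, a dataloader only needs ``__getitem__`` and ``__len__`` to plug into the training loop. A minimal self-contained sketch (the class name ``MyDataset`` and its toy in-memory samples are illustrative stand-ins, not part of this package):

```python
class MyDataset:
    """Minimal dataloader sketch following the pattern above."""
    NUM_CLASS = 2  # exposed like the built-in datasets' NUM_CLASS

    def __init__(self, split='train', transform=None):
        self.transform = transform
        # toy (sample, label) pairs standing in for files on disk
        n = 8 if split == 'train' else 4
        self.samples = [([float(i)] * 4, i % self.NUM_CLASS) for i in range(n)]

    def __getitem__(self, index):
        x, label = self.samples[index]
        if self.transform is not None:
            x = self.transform(x)
        return x, label

    def __len__(self):
        return len(self.samples)
```

A real implementation would load images from disk and return tensors, but the interface is exactly this small.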
Citation
--------
......
@@ -38,25 +38,19 @@ Test Pre-trained Model
.. role:: raw-html(raw)
    :format: html
+----------------------------------+-----------+-----------+----------------------------------------------------------------------------------------------+------------+
| Model                            | pixAcc    | mIoU      | Command                                                                                      | Logs       |
+==================================+===========+===========+==============================================================================================+============+
| Encnet_ResNet50_PContext         | 79.2%     | 51.0%     | :raw-html:`<a href="javascript:toggleblock('cmd_enc50_pcont')" class="toggleblock">cmd</a>`  | ENC50PC_   |
+----------------------------------+-----------+-----------+----------------------------------------------------------------------------------------------+------------+
| EncNet_ResNet101_PContext        | 80.7%     | 54.1%     | :raw-html:`<a href="javascript:toggleblock('cmd_enc101_pcont')" class="toggleblock">cmd</a>` | ENC101PC_  |
+----------------------------------+-----------+-----------+----------------------------------------------------------------------------------------------+------------+
| EncNet_ResNet50_ADE              | 80.1%     | 41.5%     | :raw-html:`<a href="javascript:toggleblock('cmd_enc50_ade')" class="toggleblock">cmd</a>`    | ENC50ADE_  |
+----------------------------------+-----------+-----------+----------------------------------------------------------------------------------------------+------------+
| EncNet_ResNet101_ADE             | 81.3%     | 44.4%     | :raw-html:`<a href="javascript:toggleblock('cmd_enc101_ade')" class="toggleblock">cmd</a>`   | ENC101ADE_ |
+----------------------------------+-----------+-----------+----------------------------------------------------------------------------------------------+------------+
| EncNet_ResNet101_VOC             | N/A       | 85.9%     | :raw-html:`<a href="javascript:toggleblock('cmd_enc101_voc')" class="toggleblock">cmd</a>`   | ENC101VOC_ |
+----------------------------------+-----------+-----------+----------------------------------------------------------------------------------------------+------------+
.. _ENC50PC: https://github.com/zhanghang1989/image-data/blob/master/encoding/segmentation/logs/encnet_resnet50_pcontext.log?raw=true
.. _ENC101PC: https://github.com/zhanghang1989/image-data/blob/master/encoding/segmentation/logs/encnet_resnet101_pcontext.log?raw=true
.. _ENC50ADE: https://github.com/zhanghang1989/image-data/blob/master/encoding/segmentation/logs/encnet_resnet50_ade.log?raw=true
.. _ENC101ADE: https://github.com/zhanghang1989/image-data/blob/master/encoding/segmentation/logs/encnet_resnet101_ade.log?raw=true
.. _ENC101VOC: https://github.com/zhanghang1989/image-data/blob/master/encoding/segmentation/logs/encnet_resnet101_voc.log?raw=true
.. raw:: html
......
@@ -22,16 +22,11 @@ Test Pre-trained Model
    cd PyTorch-Encoding/
    python scripts/prepare_minc.py
- Test the pre-trained model on MINC-2500. The pre-trained weights are downloaded automatically (trained on the train-1 split with a single training size of 224, giving an error rate of :math:`18.96\%` with a single crop on the test-1 set)::

    python main.py --dataset minc --model deepten_resnet50_minc --nclass 23 --pretrained --eval
    # Terminal output:
    # Loss: 0.995 | Err: 18.957% (1090/5750): 100%|████████████████████| 23/23 [00:18<00:00,  1.26it/s]
Train Your Own Model
@@ -39,7 +34,7 @@ Train Your Own Model
- Example training command for the model above::

    CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --dataset minc --model deepten_resnet50_minc --batch-size 512 --lr 0.004 --epochs 80 --lr-step 60 --lr-scheduler step --weight-decay 5e-4
- Detailed training options::

@@ -62,20 +57,6 @@ Train Your Own Model
    --eval                evaluating
Extending the Software
----------------------
This codebase is designed to be easy to extend with your own models and datasets:

- Add your own dataloader ``mydataset.py`` to the ``dataset/`` folder
- Add your own model ``mymodel.py`` to the ``model/`` folder
- Run the program::

    python main.py --dataset mydataset --model mymodel
Citation
--------
......
.. role:: hidden
    :class: hidden-section
encoding.functions
==================
.. automodule:: encoding.functions
.. currentmodule:: encoding.functions
:hidden:`batchnormtrain`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batchnormtrain
:hidden:`aggregate`
~~~~~~~~~~~~~~~~~~~
.. autofunction:: aggregate
:hidden:`scaled_l2`
~~~~~~~~~~~~~~~~~~~
.. autofunction:: scaled_l2
:hidden:`sum_square`
~~~~~~~~~~~~~~~~~~~~
.. autofunction:: sum_square
@@ -30,8 +30,7 @@ An optimized PyTorch package with CUDA backend.
    nn
    parallel
    models
    utils

Indices and tables
......
.. role:: hidden
    :class: hidden-section

encoding.models
================
.. automodule:: encoding.models.resnet
.. currentmodule:: encoding.models.resnet
ResNet
------
We provide correct dilated pre-trained ResNet and DenseNet (stride of 8) for semantic segmentation.
For dilation of DenseNet, we provide :class:`encoding.nn.DilatedAvgPool2d`.
All provided models have been verified.
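The "stride of 8" can be seen with a quick sanity check: a standard ResNet halves the spatial resolution five times (overall stride 32), while the dilated variant keeps the last two stages at stride 1 and dilates their convolutions instead. A sketch (the stage grouping below is illustrative):

```python
def overall_stride(stage_strides):
    """Product of the per-stage downsampling factors."""
    s = 1
    for st in stage_strides:
        s *= st
    return s

# conv1, maxpool, res3, res4, res5 each downsample by 2 in a standard ResNet
standard = overall_stride([2, 2, 2, 2, 2])
# the dilated variant runs res4/res5 at stride 1, using dilation to keep
# the receptive field, so only three stages downsample
dilated = overall_stride([2, 2, 2, 1, 1])
```

This is why the segmentation heads above operate on feature maps 1/8 the input size rather than 1/32.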
@@ -14,12 +20,6 @@ All provided models have been verified.
* Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation" *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
:hidden:`ResNet`
~~~~~~~~~~~~~~~~
......
@@ -14,10 +14,10 @@ Customized NN modules in Encoding Package. For Synchronized Cross-GPU Batch Norm
.. autoclass:: Encoding
    :members:
:hidden:`SyncBatchNorm`
~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: SyncBatchNorm
    :members:
:hidden:`BatchNorm1d` :hidden:`BatchNorm1d`
@@ -26,6 +26,12 @@ Customized NN modules in Encoding Package. For Synchronized Cross-GPU Batch Norm
.. autoclass:: BatchNorm1d
    :members:
:hidden:`BatchNorm2d`
~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: BatchNorm2d
    :members:
:hidden:`BatchNorm3d`
~~~~~~~~~~~~~~~~~~~~~~~~
......
@@ -2,13 +2,10 @@ Install and Citations
=====================

Installation
------------

* Install PyTorch 1.0 by following the `PyTorch instructions <http://pytorch.org/>`_.

* PIP Install::
......
@@ -10,4 +10,4 @@
"""An optimized PyTorch package with CUDA backend."""
from .version import __version__
from . import nn, functions, parallel, utils, models, datasets, transforms
import warnings
from torchvision.datasets import *
from .base import *
from .coco import COCOSegmentation
from .ade20k import ADE20KSegmentation
@@ -5,6 +7,10 @@ from .pascal_voc import VOCSegmentation
from .pascal_aug import VOCAugSegmentation
from .pcontext import ContextSegmentation
from .cityscapes import CitySegmentation
from .imagenet import ImageNetDataset
from .minc import MINCDataset
from ..utils import EncodingDeprecationWarning
datasets = {
    'coco': COCOSegmentation,
@@ -13,7 +19,40 @@ datasets = {
    'pascal_aug': VOCAugSegmentation,
    'pcontext': ContextSegmentation,
    'citys': CitySegmentation,
    'imagenet': ImageNetDataset,
    'minc': MINCDataset,
    'cifar10': CIFAR10,
}

acronyms = {
    'coco': 'coco',
    'pascal_voc': 'voc',
    'pascal_aug': 'voc',
    'pcontext': 'pcontext',
    'ade20k': 'ade',
    'citys': 'citys',
    'minc': 'minc',
    'cifar10': 'cifar10',
}
def get_dataset(name, **kwargs):
    return datasets[name.lower()](**kwargs)
def _make_deprecate(meth, old_name):
    new_name = meth.__name__

    def deprecated_init(*args, **kwargs):
        warnings.warn("encoding.datasets.{} is now deprecated in favor of encoding.datasets.{}."
                      .format(old_name, new_name), EncodingDeprecationWarning)
        return meth(*args, **kwargs)

    deprecated_init.__doc__ = r"""
    {old_name}(...)

    .. warning::
        This method is now deprecated in favor of :func:`encoding.datasets.{new_name}`.
        See :func:`~encoding.datasets.{new_name}` for details.""".format(
            old_name=old_name, new_name=new_name)
    deprecated_init.__name__ = old_name
    return deprecated_init

get_segmentation_dataset = _make_deprecate(get_dataset, 'get_segmentation_dataset')
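The deprecation shim above can be illustrated in isolation. This is a self-contained sketch: the tiny ``get_dataset`` stub and simplified warning text stand in for the real factory and message.

```python
import warnings

class EncodingDeprecationWarning(DeprecationWarning):
    """Stand-in for the warning class imported from ..utils."""

def _make_deprecate(meth, old_name):
    new_name = meth.__name__
    def deprecated(*args, **kwargs):
        # warn, then delegate to the new function unchanged
        warnings.warn("{} is deprecated in favor of {}.".format(old_name, new_name),
                      EncodingDeprecationWarning)
        return meth(*args, **kwargs)
    deprecated.__name__ = old_name
    return deprecated

def get_dataset(name):
    # stub standing in for datasets[name.lower()](**kwargs)
    return name.lower()

get_segmentation_dataset = _make_deprecate(get_dataset, 'get_segmentation_dataset')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    result = get_segmentation_dataset('ADE20K')
```

Old call sites keep working, but each call raises a recordable ``EncodingDeprecationWarning`` pointing users at the new name.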
@@ -57,6 +57,39 @@ class ADE20KSegmentation(BaseDataset):
            mask = self.target_transform(mask)
        return img, mask

    def _sync_transform(self, img, mask):
        # random mirror
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
        crop_size = self.crop_size
        w, h = img.size
        long_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.5))
        if h > w:
            oh = long_size
            ow = int(1.0 * w * long_size / h + 0.5)
            short_size = ow
        else:
            ow = long_size
            oh = int(1.0 * h * long_size / w + 0.5)
            short_size = oh
        img = img.resize((ow, oh), Image.BILINEAR)
        mask = mask.resize((ow, oh), Image.NEAREST)
        # pad crop
        if short_size < crop_size:
            padh = crop_size - oh if oh < crop_size else 0
            padw = crop_size - ow if ow < crop_size else 0
            img = ImageOps.expand(img, border=(0, 0, padw, padh), fill=0)
            mask = ImageOps.expand(mask, border=(0, 0, padw, padh), fill=0)
        # random crop to crop_size
        w, h = img.size
        x1 = random.randint(0, w - crop_size)
        y1 = random.randint(0, h - crop_size)
        img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
        mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
        # final transform
        return img, self._mask_transform(mask)

    def _mask_transform(self, mask):
        target = np.array(mask).astype('int64') - 1
        return torch.from_numpy(target)
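The long-side resize arithmetic in ``_sync_transform`` can be checked in isolation: the longer image edge is scaled to ``long_size`` and the shorter edge follows the aspect ratio, rounded to the nearest integer. ``resize_dims`` is a hypothetical helper extracted here for illustration:

```python
def resize_dims(w, h, long_size):
    """Return (ow, oh): the longer edge becomes long_size, aspect ratio kept."""
    if h > w:
        oh = long_size
        ow = int(1.0 * w * long_size / h + 0.5)
    else:
        ow = long_size
        oh = int(1.0 * h * long_size / w + 0.5)
    return ow, oh
```

For a 400x800 portrait image and ``long_size=600``, the height becomes 600 and the width 300; a square input maps to a square output.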
......
@@ -67,15 +67,16 @@ class BaseDataset(data.Dataset):
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
        crop_size = self.crop_size
        w, h = img.size
        long_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.0))
        if h > w:
            oh = long_size
            ow = int(1.0 * w * long_size / h + 0.5)
            short_size = ow
        else:
            ow = long_size
            oh = int(1.0 * h * long_size / w + 0.5)
            short_size = oh
        img = img.resize((ow, oh), Image.BILINEAR)
        mask = mask.resize((ow, oh), Image.NEAREST)
        # pad crop
@@ -90,10 +91,6 @@ class BaseDataset(data.Dataset):
        y1 = random.randint(0, h - crop_size)
        img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
        mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
        # final transform
        return img, self._mask_transform(mask)
......
@@ -87,46 +87,6 @@ class CitySegmentation(BaseDataset):
            mask = self.target_transform(mask)
        return img, mask
    def _mask_transform(self, mask):
        #target = np.array(mask).astype('int32') - 1
        target = self._class_to_index(np.array(mask).astype('int32'))
......
@@ -23,6 +23,7 @@ class COCOSegmentation(BaseDataset):
            self.root = os.path.join(root, 'train2017')
        else:
            print('val set')
            assert split == 'val'
            ann_file = os.path.join(root, 'annotations/instances_val2017.json')
            ids_file = os.path.join(root, 'annotations/val_ids.pth')
            self.root = os.path.join(root, 'val2017')
@@ -99,6 +100,7 @@ class COCOSegmentation(BaseDataset):
        print('Found number of qualified images: ', len(new_ids))
        torch.save(new_ids, ids_file)
        return new_ids

"""
NUM_CHANNEL = 91
[] background
@@ -123,4 +125,3 @@ NUM_CHANNEL = 91
[7] train
[72] tv
"""
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
class ImageNetDataset(datasets.ImageFolder):
    BASE_DIR = "ILSVRC2012"
    def __init__(self, root=os.path.expanduser('~/.encoding/data'), transform=None,
                 target_transform=None, train=True, **kwargs):
        split = 'train' if train else 'val'
        root = os.path.join(root, self.BASE_DIR, split)
        super(ImageNetDataset, self).__init__(
            root, transform, target_transform)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import os
from PIL import Image
import torch
import torch.utils.data as data
class MINCDataset(data.Dataset):
    NUM_CLASS = 23
    def __init__(self, root=os.path.expanduser('~/.encoding/data/minc-2500/'),
                 split='train', transform=None):
        self.transform = transform
        classes, class_to_idx = find_classes(root + '/images')
        if split == 'train':
            filename = os.path.join(root, 'labels/train1.txt')
        else:
            filename = os.path.join(root, 'labels/test1.txt')

        self.images, self.labels = make_dataset(filename, root,
                                                class_to_idx)
        assert len(self.images) == len(self.labels)

    def __getitem__(self, index):
        _img = Image.open(self.images[index]).convert('RGB')
        _label = self.labels[index]
        if self.transform is not None:
            _img = self.transform(_img)
        return _img, _label

    def __len__(self):
        return len(self.images)

def find_classes(dir):
    classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
    classes.sort()
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    return classes, class_to_idx

def make_dataset(filename, datadir, class_to_idx):
    images = []
    labels = []
    with open(os.path.join(filename), "r") as lines:
        for line in lines:
            _image = os.path.join(datadir, line.rstrip('\n'))
            _dirname = os.path.split(os.path.dirname(_image))[1]
            assert os.path.isfile(_image)
            label = class_to_idx[_dirname]
            images.append(_image)
            labels.append(label)
    return images, labels
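The labels-file convention ``make_dataset`` assumes is that each line of ``labels/train1.txt`` is a relative path like ``images/<class>/<file>.jpg``, with the label taken from the parent directory name. The parsing can be sketched without touching the filesystem; ``parse_lines`` is a hypothetical helper mirroring ``make_dataset``'s line handling:

```python
import os

def parse_lines(lines, class_to_idx):
    """Map each 'images/<class>/<file>' line to its class index."""
    labels = []
    for line in lines:
        image = line.rstrip('\n')
        # the label comes from the immediate parent directory
        dirname = os.path.split(os.path.dirname(image))[1]
        labels.append(class_to_idx[dirname])
    return labels
```

So a file listing ``images/brick/b1.jpg`` and ``images/wood/w1.jpg``, with ``find_classes`` having assigned brick=0 and wood=1, yields labels ``[0, 1]``.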
"""Dilated ResNet and DenseNet"""
from .resnet import *
@@ -9,71 +9,291 @@
"""Synchronized Cross-GPU Batch Normalization functions"""
import torch
import torch.cuda.comm as comm
from torch.autograd import Variable, Function
from torch.autograd.function import once_differentiable
from .. import lib

__all__ = ['moments', 'syncbatchnorm', 'inp_syncbatchnorm']
class moments(Function):
    @staticmethod
    def forward(ctx, x):
        if x.is_cuda:
            ex, ex2 = lib.gpu.expectation_forward(x)
        else:
            raise NotImplementedError
        ctx.save_for_backward(x)
        return ex, ex2

    @staticmethod
    def backward(ctx, dex, dex2):
        x, = ctx.saved_tensors
        if x.is_cuda:
            dx = lib.gpu.expectation_backward(x, dex, dex2)
        else:
            raise NotImplementedError
        return dx

class syncbatchnorm_(Function):
    @classmethod
    def forward(cls, ctx, x, gamma, beta, running_mean, running_var,
                extra, sync=True, training=True, momentum=0.1, eps=1e-05,
                activation="none", slope=0.01):
        # save context
        cls._parse_extra(ctx, extra)
        ctx.sync = sync
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.activation = activation
        ctx.slope = slope
        assert activation == 'none'

        # contiguous inputs
        x = x.contiguous()
        gamma = gamma.contiguous()
        beta = beta.contiguous()

        if ctx.training:
            if x.is_cuda:
                _ex, _exs = lib.gpu.expectation_forward(x)
            else:
                raise NotImplementedError

            if ctx.sync:
                if ctx.is_master:
                    _ex, _exs = [_ex.unsqueeze(0)], [_exs.unsqueeze(0)]
                    for _ in range(ctx.master_queue.maxsize):
                        _ex_w, _exs_w = ctx.master_queue.get()
                        ctx.master_queue.task_done()
                        _ex.append(_ex_w.unsqueeze(0))
                        _exs.append(_exs_w.unsqueeze(0))
                    _ex = comm.gather(_ex).mean(0)
                    _exs = comm.gather(_exs).mean(0)

                    tensors = comm.broadcast_coalesced((_ex, _exs), [_ex.get_device()] + ctx.worker_ids)
                    for ts, queue in zip(tensors[1:], ctx.worker_queues):
                        queue.put(ts)
                else:
                    ctx.master_queue.put((_ex, _exs))
                    _ex, _exs = ctx.worker_queue.get()
                    ctx.worker_queue.task_done()

            # Update running stats
            _var = _exs - _ex ** 2
            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * _ex)
            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * _var)

            # Mark in-place modified tensors
            ctx.mark_dirty(running_mean, running_var)
        else:
            _ex, _var = running_mean.contiguous(), running_var.contiguous()
            _exs = _var + _ex ** 2

        # BN forward
        if x.is_cuda:
            y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
        else:
            y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)

        # Output
        ctx.save_for_backward(x, _ex, _exs, gamma, beta)
        return y

    @staticmethod
    @once_differentiable
    def backward(ctx, dz):
        x, _ex, _exs, gamma, beta = ctx.saved_tensors
        dz = dz.contiguous()

        # BN backward
        if dz.is_cuda:
            dx, _dex, _dexs, dgamma, dbeta = \
                lib.gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
        else:
            raise NotImplementedError

        if ctx.training:
            if ctx.sync:
                if ctx.is_master:
                    _dex, _dexs = [_dex.unsqueeze(0)], [_dexs.unsqueeze(0)]
                    for _ in range(ctx.master_queue.maxsize):
                        _dex_w, _dexs_w = ctx.master_queue.get()
                        ctx.master_queue.task_done()
                        _dex.append(_dex_w.unsqueeze(0))
                        _dexs.append(_dexs_w.unsqueeze(0))
                    _dex = comm.gather(_dex).mean(0)
                    _dexs = comm.gather(_dexs).mean(0)

                    tensors = comm.broadcast_coalesced((_dex, _dexs), [_dex.get_device()] + ctx.worker_ids)
                    for ts, queue in zip(tensors[1:], ctx.worker_queues):
                        queue.put(ts)
                else:
                    ctx.master_queue.put((_dex, _dexs))
                    _dex, _dexs = ctx.worker_queue.get()
                    ctx.worker_queue.task_done()

            if x.is_cuda:
                dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
            else:
                raise NotImplementedError
            dx = dx + dx_

        return dx, dgamma, dbeta, None, None, None, None, None, None, None, None, None

    @staticmethod
    def _parse_extra(ctx, extra):
        ctx.is_master = extra["is_master"]
        if ctx.is_master:
            ctx.master_queue = extra["master_queue"]
            ctx.worker_queues = extra["worker_queues"]
            ctx.worker_ids = extra["worker_ids"]
        else:
            ctx.master_queue = extra["master_queue"]
            ctx.worker_queue = extra["worker_queue"]

def _act_forward(ctx, x):
    if ctx.activation.lower() == "leaky_relu":
        if x.is_cuda:
            lib.gpu.leaky_relu_forward(x, ctx.slope)
        else:
            raise NotImplementedError
    else:
        assert ctx.activation == 'none'

def _act_backward(ctx, x, dx):
    if ctx.activation.lower() == "leaky_relu":
        if x.is_cuda:
            lib.gpu.leaky_relu_backward(x, dx, ctx.slope)
        else:
            raise NotImplementedError
    else:
        assert ctx.activation == 'none'

class inp_syncbatchnorm_(Function):
    @classmethod
    def forward(cls, ctx, x, gamma, beta, running_mean, running_var,
                extra, sync=True, training=True, momentum=0.1, eps=1e-05,
                activation="none", slope=0.01):
        # save context
        cls._parse_extra(ctx, extra)
        ctx.sync = sync
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.activation = activation
        ctx.slope = slope

        # contiguous inputs
        x = x.contiguous()
        gamma = gamma.contiguous()
        beta = beta.contiguous()

        if ctx.training:
            if x.is_cuda:
                _ex, _exs = lib.gpu.expectation_forward(x)
            else:
                raise NotImplementedError

            if ctx.sync:
                if ctx.is_master:
                    _ex, _exs = [_ex.unsqueeze(0)], [_exs.unsqueeze(0)]
                    for _ in range(ctx.master_queue.maxsize):
                        _ex_w, _exs_w = ctx.master_queue.get()
                        ctx.master_queue.task_done()
                        _ex.append(_ex_w.unsqueeze(0))
                        _exs.append(_exs_w.unsqueeze(0))
                    _ex = comm.gather(_ex).mean(0)
                    _exs = comm.gather(_exs).mean(0)

                    tensors = comm.broadcast_coalesced((_ex, _exs), [_ex.get_device()] + ctx.worker_ids)
                    for ts, queue in zip(tensors[1:], ctx.worker_queues):
                        queue.put(ts)
                else:
                    ctx.master_queue.put((_ex, _exs))
                    _ex, _exs = ctx.worker_queue.get()
                    ctx.worker_queue.task_done()

            # Update running stats
            _var = _exs - _ex ** 2
            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * _ex)
            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * _var)

            # Mark in-place modified tensors
            ctx.mark_dirty(x, running_mean, running_var)
        else:
            _ex, _var = running_mean.contiguous(), running_var.contiguous()
            _exs = _var + _ex ** 2
            ctx.mark_dirty(x)

        # BN forward + activation
        if x.is_cuda:
            lib.gpu.batchnorm_inp_forward(x, _ex, _exs, gamma, beta, ctx.eps)
        else:
            raise NotImplementedError

        _act_forward(ctx, x)

        # Output
        ctx.save_for_backward(x, _ex, _exs, gamma, beta)
        return x

    @staticmethod
    @once_differentiable
    def backward(ctx, dz):
        z, _ex, _exs, gamma, beta = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)
        # BN backward
        if dz.is_cuda:
            dx, _dex, _dexs, dgamma, dbeta = \
                lib.gpu.batchnorm_inp_backward(dz, z, _ex, _exs, gamma, beta, ctx.eps)
        else:
            raise NotImplementedError

        if ctx.training:
            if ctx.sync:
                if ctx.is_master:
                    _dex, _dexs = [_dex.unsqueeze(0)], [_dexs.unsqueeze(0)]
                    for _ in range(ctx.master_queue.maxsize):
                        _dex_w, _dexs_w = ctx.master_queue.get()
                        ctx.master_queue.task_done()
                        _dex.append(_dex_w.unsqueeze(0))
                        _dexs.append(_dexs_w.unsqueeze(0))
                    _dex = comm.gather(_dex).mean(0)
                    _dexs = comm.gather(_dexs).mean(0)

                    tensors = comm.broadcast_coalesced((_dex, _dexs), [_dex.get_device()] + ctx.worker_ids)
                    for ts, queue in zip(tensors[1:], ctx.worker_queues):
                        queue.put(ts)
                else:
                    ctx.master_queue.put((_dex, _dexs))
                    _dex, _dexs = ctx.worker_queue.get()
                    ctx.worker_queue.task_done()

            if z.is_cuda:
                lib.gpu.expectation_inp_backward(dx, z, _dex, _dexs, _ex, _exs, gamma, beta, ctx.eps)
            else:
                raise NotImplementedError

        return dx, dgamma, dbeta, None, None, None, None, None, None, None, None, None

    @staticmethod
    def _parse_extra(ctx, extra):
        ctx.is_master = extra["is_master"]
        if ctx.is_master:
            ctx.master_queue = extra["master_queue"]
            ctx.worker_queues = extra["worker_queues"]
            ctx.worker_ids = extra["worker_ids"]
        else:
            ctx.master_queue = extra["master_queue"]
            ctx.worker_queue = extra["worker_queue"]

syncbatchnorm = syncbatchnorm_.apply
inp_syncbatchnorm = inp_syncbatchnorm_.apply
@@ -17,9 +17,11 @@ cpu = load('enclib_cpu', [
if torch.cuda.is_available():
    gpu = load('enclib_gpu', [
        os.path.join(gpu_path, 'operator.cpp'),
        os.path.join(gpu_path, 'activation_kernel.cu'),
        os.path.join(gpu_path, 'encoding_kernel.cu'),
        os.path.join(gpu_path, 'encodingv2_kernel.cu'),
        os.path.join(gpu_path, 'syncbn_kernel.cu'),
        os.path.join(gpu_path, 'roi_align_kernel.cu'),
        os.path.join(gpu_path, 'nms_kernel.cu'),
    ], extra_cuda_cflags=["--expt-extended-lambda"],
       build_directory=gpu_path, verbose=False)