v0.4.2 (#59)

This PR should fix most of the issues: fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/54 fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/53 fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/50

v0.4.2 (#59)
This PR should fix most of the issues: fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/54 fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/53 fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/50
07f25381 · Hang Zhang · GitHub · cebf1341 · 70fdeb79 · cebf1341
Unverified Commit 07f25381 authored Jun 04, 2018 by Hang Zhang Committed by GitHub Jun 04, 2018
20 changed files
--- a/encoding/kernel/include/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cu
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
-#else
-
-/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
-/// error if the dimensionality does not match exactly.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  if (Dim != THCTensor_(nDimension)(state, t)) {
-    THError("THCudaTensor dimension mismatch");
-  }
-  // Determine the maximum offset into the tensor achievable; `IndexT`
-  // must be smaller than this type in order to use it.
-  ptrdiff_t maxOffset = 0;
-  IndexT sizes[Dim];
-  IndexT strides[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    int64_t size = THCTensor_(size)(state, t, i);
-    int64_t stride = THCTensor_(stride)(state, t, i);
-
-    maxOffset += (size - 1) * stride;
-
-    sizes[i] = (IndexT) size;
-    strides[i] = (IndexT) stride;
-  }
-
-  if (maxOffset > std::numeric_limits<IndexT>::max()) {
-    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
-  }
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
-    THCTensor_(data)(state, t), sizes, strides);
-}
-
-#endif
--- a/encoding/kernel/include/THCDeviceTensorUtils.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cuh
-#ifndef THC_DEVICE_TENSOR_UTILS_INC
-#define THC_DEVICE_TENSOR_UTILS_INC
-
-#include "THCDeviceTensor.cuh"
-#include "THCTensor.h"
-#include <limits>
-
-/// Constructs a DeviceTensor initialized from a THCudaTensor by
-/// upcasting or downcasting the tensor to that of a different
-/// dimension.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  return toDeviceTensorCast<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  return toDeviceTensorCast<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-#include "generic/THCDeviceTensorUtils.cu"
-#include "THCGenerateAllTypes.h"
-
-#include "THCDeviceTensorUtils-inl.cuh"
-
-#endif // THC_DEVICE_TENSOR_UTILS_INC
--- a/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
-#else
-
-/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
-/// error if the dimensionality does not match exactly.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  if (Dim != THCTensor_(nDimension)(state, t)) {
-    THError("THCudaTensor dimension mismatch");
-  }
-  // Determine the maximum offset into the tensor achievable; `IndexT`
-  // must be smaller than this type in order to use it.
-  ptrdiff_t maxOffset = 0;
-  IndexT sizes[Dim];
-  IndexT strides[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    int64_t size = THCTensor_(size)(state, t, i);
-    int64_t stride = THCTensor_(stride)(state, t, i);
-
-    maxOffset += (size - 1) * stride;
-
-    sizes[i] = (IndexT) size;
-    strides[i] = (IndexT) stride;
-  }
-
-  if (maxOffset > std::numeric_limits<IndexT>::max()) {
-    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
-  }
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
-    THCTensor_(data)(state, t), sizes, strides);
-}
-
-#endif
--- a/encoding/kernel/thc_encoding.cu
+++ b/encoding/kernel/thc_encoding.cu
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#include "thc_encoding.h"
-#include "common.h"
-
-#include "generic/device_tensor.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/device_tensor.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// float
-#include "generic/encoding_utils.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/encoding_kernel.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/syncbn_kernel.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/pooling_kernel.c"
-#include "THC/THCGenerateFloatType.h"
-
-// double
-#include "generic/encoding_utils.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/encoding_kernel.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/syncbn_kernel.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/pooling_kernel.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-}
-#endif
--- a/encoding/kernel/thc_encoding.h
+++ b/encoding/kernel/thc_encoding.h
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#include <THC.h>
-#include "THCDeviceTensor.cuh"
-#include "THCDeviceTensorUtils.cuh"
-
-// this symbol will be resolved automatically from PyTorch libs
-extern THCState *state;
-
-#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
-#define THCTensor        TH_CONCAT_3(TH,CReal,Tensor)
-#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// float
-#include "generic/encoding_kernel.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/syncbn_kernel.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/pooling_kernel.h"
-#include "THC/THCGenerateFloatType.h"
-
-// double
-#include "generic/encoding_kernel.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/syncbn_kernel.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/pooling_kernel.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-}
-#endif
--- a/encoding/make.sh
+++ b/encoding/make.sh
-#!/usr/bin/env bash
-mkdir -p encoding/lib && cd encoding/lib
-# compile and install
-cmake ..
-make
--- a/encoding/models/__init__.py
+++ b/encoding/models/__init__.py
+from .model_zoo import get_model
+from .base import *
+from .fcn import *
+from .encnet import *
+
+def get_segmentation_model(name, **kwargs):
+    """Build a segmentation network selected by ``name`` (case-insensitive).
+
+    Supported names are ``'fcn'`` and ``'encnet'``; ``kwargs`` are forwarded
+    unchanged to the matching factory. An unknown name raises ``KeyError``.
+    """
+    from .fcn import get_fcn
+    # get_encnet comes from the star-import of .encnet at the top of this module.
+    factories = {
+        'fcn': get_fcn,
+        'encnet': get_encnet,
+    }
+    factory = factories[name.lower()]
+    return factory(**kwargs)
--- a/encoding/models/base.py
+++ b/encoding/models/base.py
+###########################################################################
+# Created by: Hang Zhang 
+# Email: zhang.hang@rutgers.edu 
+# Copyright (c) 2017
+###########################################################################
+
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.functional import upsample
+from torch.nn.parallel.data_parallel import DataParallel
+from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel.scatter_gather import scatter
+
+from .. import dilated as resnet
+from ..utils import batch_pix_accuracy, batch_intersection_union
+
+up_kwargs = {'mode': 'bilinear', 'align_corners': True}
+
+__all__ = ['BaseNet', 'EvalModule', 'MultiEvalModule']
+
+class BaseNet(nn.Module):
+    """Shared base class for the segmentation networks in this package.
+
+    Stores the number of classes, the auxiliary / SE-loss flags and the
+    normalization statistics, and instantiates the pretrained backbone
+    selected by ``backbone`` ('resnet50', 'resnet101' or 'resnet152').
+    Any other backbone name raises ``RuntimeError``.
+    """
+    def __init__(self, nclass, backbone, aux, se_loss, dilated=True, norm_layer=None,
+                 mean=[.485, .456, .406], std=[.229, .224, .225]):
+        super(BaseNet, self).__init__()
+        self.nclass = nclass
+        self.aux = aux
+        self.se_loss = se_loss
+        self.mean = mean
+        self.std = std
+        # Only these constructors of the backbone module are supported.
+        if backbone not in ('resnet50', 'resnet101', 'resnet152'):
+            raise RuntimeError('unknown backbone: {}'.format(backbone))
+        build_backbone = getattr(resnet, backbone)
+        self.pretrained = build_backbone(pretrained=True, dilated=dilated,
+                                         norm_layer=norm_layer)
+        # Options passed to every upsample call made by subclasses.
+        self._up_kwargs = up_kwargs
+
+    def base_forward(self, x):
+        """Run the backbone stem and return the outputs of layer1..layer4."""
+        net = self.pretrained
+        stem = net.maxpool(net.relu(net.bn1(net.conv1(x))))
+        c1 = net.layer1(stem)
+        c2 = net.layer2(c1)
+        c3 = net.layer3(c2)
+        c4 = net.layer4(c3)
+        return c1, c2, c3, c4
+
+    def evaluate(self, x, target=None):
+        """Run ``forward`` and optionally score the prediction.
+
+        Without ``target`` the (first) prediction is returned. With ``target``
+        the tuple ``(correct, labeled, inter, union)`` computed by
+        ``batch_pix_accuracy`` and ``batch_intersection_union`` is returned.
+        """
+        pred = self.forward(x)
+        # Networks may return several outputs; only the first one is scored.
+        if isinstance(pred, (tuple, list)):
+            pred = pred[0]
+        if target is not None:
+            correct, labeled = batch_pix_accuracy(pred.data, target.data)
+            inter, union = batch_intersection_union(pred.data, target.data, self.nclass)
+            return correct, labeled, inter, union
+        return pred
+
+
+class EvalModule(nn.Module):
+    """Segmentation Eval Module
+
+    Wraps a network so that calling the wrapper invokes the network's
+    ``evaluate`` method with the same arguments.
+    """
+    def __init__(self, module):
+        super(EvalModule, self).__init__()
+        self.module = module
+
+    def forward(self, *inputs, **kwargs):
+        evaluate = self.module.evaluate
+        return evaluate(*inputs, **kwargs)
+
+
+class MultiEvalModule(DataParallel):
+    """Multi-size Segmentation Evaluator
+
+    Evaluates a single image at every scale in ``scales`` (relative to
+    ``base_size``), optionally averaging with the horizontally flipped image,
+    and accumulates the per-scale scores at the original resolution.
+    """
+    def __init__(self, module, nclass, device_ids=None,
+                 base_size=520, crop_size=480, flip=True,
+                 scales=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75]):
+        super(MultiEvalModule, self).__init__(module, device_ids)
+        self.nclass = nclass
+        self.base_size = base_size
+        self.crop_size = crop_size
+        self.scales = scales
+        self.flip = flip
+
+    def parallel_forward(self, inputs, **kwargs):
+        """Multi-GPU Multi-size Evaluation
+
+        Args:
+            inputs: list of Tensors
+
+        Each input gets a leading batch dimension and is moved to one of
+        ``self.device_ids``; keyword arguments, if any, are scattered across
+        the same devices.
+        """
+        inputs = [(image.unsqueeze(0).cuda(device),)
+                  for image, device in zip(inputs, self.device_ids)]
+        target_gpus = self.device_ids[:len(inputs)]
+        replicas = self.replicate(self, target_gpus)
+        # The original referenced undefined names ``target_gpus`` and ``dim``
+        # here, raising NameError whenever kwargs were supplied.
+        kwargs = scatter(kwargs, target_gpus, self.dim) if kwargs else []
+        # Pad the shorter of inputs / kwargs so both have one entry per replica.
+        if len(inputs) < len(kwargs):
+            inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+        elif len(kwargs) < len(inputs):
+            kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+        outputs = self.parallel_apply(replicas, inputs, kwargs)
+        return outputs
+
+    def forward(self, image):
+        """Multi-size Evaluation
+
+        ``image`` must be a 4-D tensor with batch size 1. Returns the sum over
+        all scales of the score maps, each resized back to the input size.
+        """
+        # only single image is supported for evaluation
+        batch, _, h, w = image.size()
+        assert(batch == 1)
+        stride_rate = 2.0/3.0
+        crop_size = self.crop_size
+        # Sliding-window step: consecutive crops overlap by one third.
+        stride = int(crop_size * stride_rate)
+        with torch.cuda.device_of(image):
+            scores = image.new().resize_(batch,self.nclass,h,w).zero_().cuda()
+
+        for scale in self.scales:
+            # Rescale so that the longer side equals long_size, keeping aspect ratio.
+            long_size = int(math.ceil(self.base_size * scale))
+            if h > w:
+                height = long_size
+                width = int(1.0 * w * long_size / h + 0.5)
+                short_size = width
+            else:
+                width = long_size
+                height = int(1.0 * h * long_size / w + 0.5)
+                short_size = height
+            # resize image to current size
+            cur_img = resize_image(image, height, width)
+            if scale <= 1.25 or long_size <= crop_size:
+                # Single pass: pad up to crop_size, infer, crop the padding away.
+                pad_img = pad_image(cur_img, self.module.mean,
+                                    self.module.std, crop_size)
+                outputs = self.module_inference(pad_img)
+                outputs = crop_image(outputs, 0, height, 0, width)
+            else:
+                if short_size < crop_size:
+                    # pad if needed
+                    pad_img = pad_image(cur_img, self.module.mean,
+                                        self.module.std, crop_size)
+                else:
+                    pad_img = cur_img
+                _,_,ph,pw = pad_img.size()
+                assert(ph >= height and pw >= width)
+                # grid forward and normalize
+                h_grids = int(math.ceil(1.0*(ph-crop_size)/stride)) + 1
+                w_grids = int(math.ceil(1.0*(pw-crop_size)/stride)) + 1
+                with torch.cuda.device_of(image):
+                    outputs = image.new().resize_(batch,self.nclass,ph,pw).zero_().cuda()
+                    count_norm = image.new().resize_(batch,1,ph,pw).zero_().cuda()
+                # grid evaluation
+                for idh in range(h_grids):
+                    for idw in range(w_grids):
+                        h0 = idh * stride
+                        w0 = idw * stride
+                        h1 = min(h0 + crop_size, ph)
+                        w1 = min(w0 + crop_size, pw)
+                        crop_img = crop_image(pad_img, h0, h1, w0, w1)
+                        # pad if needed
+                        pad_crop_img = pad_image(crop_img, self.module.mean,
+                                                 self.module.std, crop_size)
+                        output = self.module_inference(pad_crop_img)
+                        outputs[:,:,h0:h1,w0:w1] += crop_image(output,
+                            0, h1-h0, 0, w1-w0)
+                        # Track how many crops covered each pixel for averaging.
+                        count_norm[:,:,h0:h1,w0:w1] += 1
+                assert((count_norm==0).sum()==0)
+                outputs = outputs / count_norm
+                outputs = outputs[:,:,:height,:width]
+
+            score = resize_image(outputs, h, w)
+            scores += score
+
+        return scores
+
+    def module_inference(self, image):
+        """Evaluate ``image`` (plus its flipped copy when ``self.flip`` is set)
+        and return the element-wise exponential of the summed outputs."""
+        output = self.module.evaluate(image)
+        if self.flip:
+            fimg = flip_image(image)
+            foutput = self.module.evaluate(fimg)
+            # Flip the prediction back before accumulating.
+            output += flip_image(foutput)
+        return output.exp()
+
+
+def resize_image(img, h, w, mode='bilinear'):
+    """Resize ``img`` to ``(h, w)`` with ``F.upsample``.
+
+    The module-level ``up_kwargs`` provide the default options. ``mode``
+    overrides the interpolation mode; it used to be accepted but ignored.
+    With the default ``mode`` the call is identical to the previous behavior.
+    """
+    kwargs = dict(up_kwargs, mode=mode)
+    if mode in ('nearest', 'area'):
+        # align_corners is only valid for the linear interpolation modes.
+        kwargs.pop('align_corners', None)
+    return F.upsample(img, (h, w), **kwargs)
+
+def pad_image(img, mean, std, crop_size):
+    """Pad a 3-channel image batch at the bottom/right up to ``crop_size``.
+
+    Each channel is filled with ``-mean[c] / std[c]``. Dimensions that are
+    already at least ``crop_size`` are left unchanged.
+    """
+    batch, channels, height, width = img.size()
+    assert(channels==3)
+    pad_h = max(crop_size - height, 0)
+    pad_w = max(crop_size - width, 0)
+    fill_values = -np.array(mean) / np.array(std)
+    padded = img.new().resize_(batch, channels, height + pad_h, width + pad_w)
+    for ch in range(channels):
+        # F.pad takes the padding of the last dimension first.
+        channel = F.pad(img[:,ch,:,:], (0, pad_w, 0, pad_h),
+                        value=fill_values[ch])
+        padded[:,ch,:,:] = channel
+    assert(padded.size(2)>=crop_size and padded.size(3)>=crop_size)
+    return padded
+
+def crop_image(img, h0, h1, w0, w1):
+    """Return the window ``[h0:h1, w0:w1]`` of the last two dims of a 4-D tensor."""
+    rows = slice(h0, h1)
+    cols = slice(w0, w1)
+    return img[:, :, rows, cols]
+
+def flip_image(img):
+    """Reverse a 4-D tensor along dimension 3."""
+    assert(img.dim()==4)
+    last = img.size(3) - 1
+    with torch.cuda.device_of(img):
+        # Descending index last, ..., 0 cast to long for index_select.
+        reversed_idx = torch.arange(last, -1, -1).type_as(img).long()
+    return img.index_select(3, reversed_idx)
--- a/encoding/models/encnet.py
+++ b/encoding/models/encnet.py
+###########################################################################
+# Created by: Hang Zhang 
+# Email: zhang.hang@rutgers.edu 
+# Copyright (c) 2017
+###########################################################################
+
+import torch
+from torch.autograd import Variable
+import torch.nn as nn
+from torch.nn.functional import upsample
+
+import encoding
+from .base import BaseNet
+from .fcn import FCNHead
+
+__all__ = ['EncNet', 'EncModule', 'get_encnet', 'get_encnet_resnet50_pcontext']
+
+class EncNet(BaseNet):
+    """Segmentation network combining the BaseNet backbone with an EncHead.
+
+    When ``aux`` is true an auxiliary ``FCNHead`` is attached to the
+    third-stage features and its upsampled output is appended to the result.
+    """
+    def __init__(self, nclass, backbone, aux=True, se_loss=True,
+                 norm_layer=nn.BatchNorm2d, **kwargs):
+        super(EncNet, self).__init__(nclass, backbone, aux, se_loss, norm_layer=norm_layer)
+        self.head = EncHead(self.nclass, in_channels=2048, se_loss=se_loss,
+                            norm_layer=norm_layer, up_kwargs=self._up_kwargs)
+        if aux:
+            self.auxlayer = FCNHead(1024, nclass, norm_layer=norm_layer)
+
+    def forward(self, x):
+        """Return a tuple whose first element is the prediction upsampled to
+        the input size, followed by any extra head outputs and the auxiliary
+        prediction (if enabled)."""
+        imsize = x.size()[2:]
+        _, _, c3, c4 = self.base_forward(x)
+
+        outputs = list(self.head(c4))
+        outputs[0] = upsample(outputs[0], imsize, **self._up_kwargs)
+        if self.aux:
+            aux_pred = upsample(self.auxlayer(c3), imsize, **self._up_kwargs)
+            outputs.append(aux_pred)
+        return tuple(outputs)
+
+
+class EncModule(nn.Module):
+    """Encoding module that rescales feature channels with a learned gate.
+
+    The input is encoded, reduced to a vector, passed through a linear layer
+    and a sigmoid, and the result scales the input channel-wise (with a
+    residual connection). When ``se_loss`` is true, a second output produced
+    by a linear layer over the encoded vector is returned as well.
+    """
+    def __init__(self, in_channels, nclass, ncodes=32, se_loss=True, norm_layer=None):
+        super(EncModule, self).__init__()
+        # Callers pass the normalization *class* (EncHead calls norm_layer(512)
+        # and forwards the same object here), so a plain isinstance check
+        # against encoding.nn.BatchNorm2d never matched. Accept both a class
+        # and, for backward compatibility, an instance.
+        uses_encoding_bn = isinstance(norm_layer, encoding.nn.BatchNorm2d) or (
+            isinstance(norm_layer, type)
+            and issubclass(norm_layer, encoding.nn.BatchNorm2d))
+        if uses_encoding_bn:
+            norm_layer = encoding.nn.BatchNorm1d
+        else:
+            norm_layer = nn.BatchNorm1d
+        self.se_loss = se_loss
+        self.encoding = nn.Sequential(
+            encoding.nn.Encoding(D=in_channels, K=ncodes),
+            norm_layer(ncodes),
+            nn.ReLU(inplace=True),
+            encoding.nn.Sum(dim=1))
+        self.fc = nn.Sequential(
+            nn.Linear(in_channels, in_channels),
+            nn.Sigmoid())
+        if self.se_loss:
+            self.selayer = nn.Linear(in_channels, nclass)
+
+    def forward(self, x):
+        """Return ``(x + x * gate,)`` plus the ``selayer`` output when
+        ``se_loss`` is enabled."""
+        en = self.encoding(x)
+        b, c, _, _ = x.size()
+        gamma = self.fc(en)
+        # One gate value per channel, broadcast over the spatial dims.
+        y = gamma.view(b, c, 1, 1)
+        # residual ?
+        outputs = [x + x * y]
+        if self.se_loss:
+            outputs.append(self.selayer(en))
+        return tuple(outputs)
+
+
+class EncHead(nn.Module):
+    """Prediction head: 3x3 conv block, EncModule, dropout and 1x1 classifier.
+
+    ``norm_layer`` is the normalization class used after the first
+    convolution; ``None`` (the default) falls back to ``nn.BatchNorm2d``
+    instead of failing when the layer is constructed. ``up_kwargs`` is
+    accepted for interface compatibility and not used by this head.
+    """
+    def __init__(self, out_channels, in_channels, se_loss=True,
+                 norm_layer=None, up_kwargs=None):
+        super(EncHead, self).__init__()
+        if norm_layer is None:
+            # The declared default used to crash at norm_layer(512).
+            norm_layer = nn.BatchNorm2d
+        self.conv5 = nn.Sequential(
+            nn.Conv2d(in_channels, 512, 3, padding=1, bias=False),
+            norm_layer(512),
+            nn.ReLU(True))
+        self.encmodule = EncModule(512, out_channels, ncodes=32,
+            se_loss=se_loss, norm_layer=norm_layer)
+        self.dropout = nn.Dropout2d(0.1, False)
+        self.conv6 = nn.Conv2d(512, out_channels, 1)
+        self.se_loss = se_loss
+
+    def forward(self, x):
+        """Return the EncModule outputs with the first one replaced by the
+        classifier prediction."""
+        x = self.conv5(x)
+        outs = list(self.encmodule(x))
+        outs[0] = self.conv6(self.dropout(outs[0]))
+        return tuple(outs)
+
+
+def get_encnet(dataset='pascal_voc', backbone='resnet50', pretrained=False,
+               root='~/.encoding/models', **kwargs):
+    r"""EncNet model from the paper `"Context Encoding for Semantic Segmentation"
+    <https://arxiv.org/pdf/1803.08904.pdf>`_
+
+    Parameters
+    ----------
+    dataset : str, default pascal_voc
+        The dataset that model pretrained on (pascal_voc, ade20k, pcontext).
+        Matched case-insensitively.
+    backbone : str, default resnet50
+        The backbone network. (resnet50, 101, 152)
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    **kwargs
+        Forwarded to the ``EncNet`` constructor.
+
+
+    Examples
+    --------
+    >>> model = get_encnet(dataset='pascal_voc', backbone='resnet50', pretrained=False)
+    >>> print(model)
+    """
+    acronyms = {
+        'pascal_voc': 'voc',
+        'ade20k': 'ade',
+        'pcontext': 'pcontext',
+    }
+    # Normalize once so the class-count lookup and the checkpoint-name lookup
+    # agree; previously only the former was case-insensitive.
+    dataset = dataset.lower()
+    # infer number of classes
+    from ..datasets import datasets, VOCSegmentation, VOCAugSegmentation, ADE20KSegmentation
+    model = EncNet(datasets[dataset].NUM_CLASS, backbone=backbone, **kwargs)
+    if pretrained:
+        from .model_store import get_model_file
+        model.load_state_dict(torch.load(
+            get_model_file('encnet_%s_%s'%(backbone, acronyms[dataset]), root=root)))
+    return model
+
+def get_encnet_resnet50_pcontext(pretrained=False, root='~/.encoding/models', **kwargs):
+    r"""EncNet (ResNet-50 backbone, 'pcontext' dataset) from the paper
+    `"Context Encoding for Semantic Segmentation"
+    <https://arxiv.org/pdf/1803.08904.pdf>`_
+
+    Parameters
+    ----------
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    **kwargs
+        Forwarded to ``get_encnet``.
+
+
+    Examples
+    --------
+    >>> model = get_encnet_resnet50_pcontext(pretrained=True)
+    >>> print(model)
+    """
+    # root and kwargs were previously accepted but silently dropped.
+    return get_encnet('pcontext', 'resnet50', pretrained, root=root, **kwargs)
--- a/encoding/models/fcn.py
+++ b/encoding/models/fcn.py
+###########################################################################
+# Created by: Hang Zhang 
+# Email: zhang.hang@rutgers.edu 
+# Copyright (c) 2017
+###########################################################################
+from __future__ import division
+import os
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.functional import upsample
+
+from .base import BaseNet
+
+__all__ = ['FCN', 'get_fcn', 'get_fcn_resnet50_pcontext', 'get_fcn_resnet50_ade']
+
+class FCN(BaseNet):
+    r"""Fully Convolutional Networks for Semantic Segmentation
+
+    Parameters
+    ----------
+    nclass : int
+        Number of categories for the training dataset.
+    backbone : string
+        Pre-trained dilated backbone network type ('resnet50', 'resnet101'
+        or 'resnet152').
+    aux : bool
+        Whether to attach an auxiliary head to the third-stage features
+        (default: True).
+    norm_layer : object
+        Normalization layer used in the backbone and the heads
+        (default: :class:`torch.nn.BatchNorm2d`).
+
+
+    Reference:
+
+        Long, Jonathan, Evan Shelhamer, and Trevor Darrell. "Fully convolutional networks
+        for semantic segmentation." *CVPR*, 2015
+
+    Examples
+    --------
+    >>> model = FCN(nclass=21, backbone='resnet50')
+    >>> print(model)
+    """
+    def __init__(self, nclass, backbone, aux=True, se_loss=False, norm_layer=nn.BatchNorm2d, **kwargs):
+        super(FCN, self).__init__(nclass, backbone, aux, se_loss, norm_layer=norm_layer)
+        self.head = FCNHead(2048, nclass, norm_layer)
+        if aux:
+            self.auxlayer = FCNHead(1024, nclass, norm_layer)
+
+    def forward(self, x):
+        """Return a tuple with the prediction upsampled to the input size
+        and, when ``aux`` is enabled, the upsampled auxiliary prediction."""
+        imsize = x.size()[2:]
+        _, _, c3, c4 = self.base_forward(x)
+
+        main_out = upsample(self.head(c4), imsize, **self._up_kwargs)
+        if not self.aux:
+            return (main_out,)
+        aux_out = upsample(self.auxlayer(c3), imsize, **self._up_kwargs)
+        return (main_out, aux_out)
+
+        
+class FCNHead(nn.Module):
+    """Head made of a 3x3 convolution (to a quarter of the input channels),
+    normalization, ReLU, dropout and a 1x1 classification convolution."""
+    def __init__(self, in_channels, out_channels, norm_layer):
+        super(FCNHead, self).__init__()
+        inter_channels = in_channels // 4
+        layers = [
+            nn.Conv2d(in_channels, inter_channels, 3, padding=1),
+            norm_layer(inter_channels),
+            nn.ReLU(),
+            nn.Dropout2d(0.1, False),
+            nn.Conv2d(inter_channels, out_channels, 1),
+        ]
+        self.conv5 = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.conv5(x)
+
+
+def get_fcn(dataset='pascal_voc', backbone='resnet50', pretrained=False,
+            root='~/.encoding/models', **kwargs):
+    r"""FCN model from the paper `"Fully Convolutional Network for semantic segmentation"
+    <https://people.eecs.berkeley.edu/~jonlong/long_shelhamer_fcn.pdf>`_
+
+    Parameters
+    ----------
+    dataset : str, default pascal_voc
+        The dataset that model pretrained on (pascal_voc, pascal_aug,
+        pcontext, ade20k). Matched case-insensitively.
+    backbone : str, default resnet50
+        The backbone network passed to ``FCN``.
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    **kwargs
+        Forwarded to the ``FCN`` constructor.
+
+    Examples
+    --------
+    >>> model = get_fcn(dataset='pascal_voc', backbone='resnet50', pretrained=False)
+    >>> print(model)
+    """
+    acronyms = {
+        'pascal_voc': 'voc',
+        'pascal_aug': 'voc',
+        'pcontext': 'pcontext',
+        'ade20k': 'ade',
+    }
+    # Normalize once so the class-count lookup and the checkpoint-name lookup
+    # agree; previously only the former was case-insensitive.
+    dataset = dataset.lower()
+    # infer number of classes
+    from ..datasets import datasets, VOCSegmentation, VOCAugSegmentation, ADE20KSegmentation
+    model = FCN(datasets[dataset].NUM_CLASS, backbone=backbone, **kwargs)
+    if pretrained:
+        from .model_store import get_model_file
+        # strict=False: keys missing from or extra in the checkpoint are tolerated.
+        model.load_state_dict(torch.load(
+            get_model_file('fcn_%s_%s'%(backbone, acronyms[dataset]), root=root)),
+            strict= False)
+    return model
+
+def get_fcn_resnet50_pcontext(pretrained=False, root='~/.encoding/models', **kwargs):
+    r"""FCN model with a ResNet-50 backbone for the 'pcontext' dataset.
+
+    Parameters
+    ----------
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    **kwargs
+        Forwarded to ``get_fcn``.
+
+
+    Examples
+    --------
+    >>> model = get_fcn_resnet50_pcontext(pretrained=True)
+    >>> print(model)
+    """
+    # root and kwargs were previously accepted but silently dropped.
+    return get_fcn('pcontext', 'resnet50', pretrained, root=root, **kwargs)
+
+def get_fcn_resnet50_ade(pretrained=False, root='~/.encoding/models', **kwargs):
+    r"""FCN model with a ResNet-50 backbone for the 'ade20k' dataset.
+
+    Parameters
+    ----------
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    **kwargs
+        Forwarded to ``get_fcn``.
+
+
+    Examples
+    --------
+    >>> model = get_fcn_resnet50_ade(pretrained=True)
+    >>> print(model)
+    """
+    # root and kwargs were previously accepted but silently dropped.
+    return get_fcn('ade20k', 'resnet50', pretrained, root=root, **kwargs)
--- a/encoding/models/model_store.py
+++ b/encoding/models/model_store.py
+"""Model store which provides pretrained models."""
+from __future__ import print_function
+__all__ = ['get_model_file', 'purge']
+import os
+import zipfile
+
+from ..utils import download, check_sha1
+
+_model_sha1 = {name: checksum for checksum, name in [
+    ('eeed8e582f0fdccdba8579e7490570adc6d85c7c', 'fcn_resnet50_pcontext'),
+    ('969062a5aad2d1d983bae2f9e412578b62610114', 'encnet_resnet50_pcontext'),
+    ('fc8c0b795abf0133700c2d4265d2f9edab7eb6cc', 'fcn_resnet50_ade'),
+    ]}
+
+encoding_repo_url = 'https://hangzh.s3.amazonaws.com/'
+_url_format = '{repo_url}encoding/models/{file_name}.zip'
+
+def short_hash(name):
+    """Return the first eight characters of the SHA-1 registered for ``name``.
+
+    Raises ``ValueError`` when no pretrained model is registered under ``name``.
+    """
+    checksum = _model_sha1.get(name)
+    if checksum is None:
+        raise ValueError('Pretrained model for {name} is not available.'.format(name=name))
+    return checksum[:8]
+
+def get_model_file(name, root=os.path.join('~', '.encoding', 'models')):
+    r"""Return the local path of a pretrained model file, downloading it if needed.
+
+    This function will download from online model zoo when model cannot be found or has mismatch.
+    The root directory will be created if it doesn't exist.
+
+    Parameters
+    ----------
+    name : str
+        Name of the model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+    Returns
+    -------
+    file_path
+        Path to the requested pretrained model file.
+
+    Raises
+    ------
+    ValueError
+        If ``name`` has no registered checksum, or if the downloaded file
+        does not match the registered SHA-1.
+    """
+    # Files are stored as '<name>-<first 8 chars of sha1>.pth' under root.
+    file_name = '{name}-{short_hash}'.format(name=name, short_hash=short_hash(name))
+    root = os.path.expanduser(root)
+    file_path = os.path.join(root, file_name+'.pth')
+    sha1_hash = _model_sha1[name]
+    # Reuse an existing file only when its checksum matches.
+    if os.path.exists(file_path):
+        if check_sha1(file_path, sha1_hash):
+            return file_path
+        else:
+            print('Mismatch in the content of model file detected. Downloading again.')
+    else:
+        print('Model file is not found. Downloading.')
+
+    if not os.path.exists(root):
+        os.makedirs(root)
+
+    zip_file_path = os.path.join(root, file_name+'.zip')
+    # The ENCODING_REPO environment variable overrides the default repository URL.
+    repo_url = os.environ.get('ENCODING_REPO', encoding_repo_url)
+    if repo_url[-1] != '/':
+        repo_url = repo_url + '/'
+    download(_url_format.format(repo_url=repo_url, file_name=file_name),
+             path=zip_file_path,
+             overwrite=True)
+    # Unpack the archive into root and discard the zip afterwards.
+    with zipfile.ZipFile(zip_file_path) as zf:
+        zf.extractall(root)
+    os.remove(zip_file_path)
+
+    # Verify the freshly extracted file before handing it back.
+    if check_sha1(file_path, sha1_hash):
+        return file_path
+    else:
+        raise ValueError('Downloaded file has different hash. Please try again.')
+
+def purge(root=os.path.join('~', '.encoding', 'models')):
+    r"""Purge all pretrained model files in local file store.
+
+    Removes every ``.pth`` file directly inside ``root``. Does nothing when
+    ``root`` does not exist (e.g. no model has been downloaded yet).
+
+    Parameters
+    ----------
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    """
+    root = os.path.expanduser(root)
+    if not os.path.isdir(root):
+        # os.listdir would raise on a missing directory; there is nothing to purge.
+        return
+    for f in os.listdir(root):
+        if f.endswith(".pth"):
+            os.remove(os.path.join(root, f))
+
+def pretrained_model_list():
+    """Return the names of all models that have a registered checksum."""
+    return [model_name for model_name in _model_sha1]
--- a/encoding/models/model_zoo.py
+++ b/encoding/models/model_zoo.py
+# pylint: disable=wildcard-import, unused-wildcard-import
+
+from .fcn import *
+from .encnet import *
+
+__all__ = ['get_model']
+
+
+def get_model(name, **kwargs):
+    """Returns a pre-defined model by name
+
+    Parameters
+    ----------
+    name : str
+        Name of the model (matched case-insensitively).
+    **kwargs
+        Forwarded to the model factory, e.g.:
+
+        pretrained : bool
+            Whether to load the pretrained weights for model.
+        root : str, default '~/.encoding/models'
+            Location for keeping the model parameters.
+
+    Returns
+    -------
+    Module:
+        The model.
+
+    Raises
+    ------
+    ValueError
+        If ``name`` is not one of the pre-defined models; the message lists
+        the available names.
+    """
+    models = {
+        'fcn_resnet50_pcontext': get_fcn_resnet50_pcontext,
+        'encnet_resnet50_pcontext': get_encnet_resnet50_pcontext,
+        'fcn_resnet50_ade': get_fcn_resnet50_ade,
+        }
+    name = name.lower()
+    if name not in models:
+        # The original formatted an undefined variable ``e`` here, which
+        # raised NameError instead of the intended ValueError.
+        raise ValueError('%s\n\t%s' % (
+            'Model %s is not supported. Available options are' % name,
+            '\n\t'.join(sorted(models.keys()))))
+    net = models[name](**kwargs)
+    return net
--- a/encoding/nn/customize.py
+++ b/encoding/nn/customize.py
@@ -11,13 +11,15 @@
 """Encoding Custermized NN Module"""
 import torch
 from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d, \
-    NLLLoss, BCELoss, CrossEntropyLoss
+    NLLLoss, BCELoss, CrossEntropyLoss, AvgPool2d, MaxPool2d, Parameter
 from torch.nn import functional as F
-
+from torch.autograd import Variable
 from .syncbn import BatchNorm2d

+torch_ver = torch.__version__[:3]
+
 __all__ = ['GramMatrix', 'SegmentationLosses', 'View', 'Sum', 'Mean',
-           'Normalize', 'PyramidPooling']
+           'Normalize']


 class GramMatrix(Module):
@@ -39,39 +41,51 @@ def softmax_crossentropy(input, target, weight, size_average, ignore_index, redu

 class SegmentationLosses(CrossEntropyLoss):
    """2D Cross Entropy Loss with Auxilary Loss"""
-    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
+    def __init__(self, se_loss=False, se_weight=0.1, nclass=-1,
+                 aux=False, aux_weight=0.2, weight=None,
+                 size_average=True, ignore_index=-1):
        super(SegmentationLosses, self).__init__(weight, size_average, ignore_index)
+        self.se_loss = se_loss
        self.aux = aux
+        self.nclass = nclass
+        self.se_weight = se_weight
        self.aux_weight = aux_weight
+        self.bceloss = BCELoss(weight, size_average) 

    def forward(self, *inputs):
-        if not self.aux:
+        if not self.se_loss and not self.aux:
            return super(SegmentationLosses, self).forward(*inputs)
-        pred1, pred2, target = tuple(inputs)
-        loss1 = super(SegmentationLosses, self).forward(pred1, target)
-        loss2 = super(SegmentationLosses, self).forward(pred2, target)
-        return loss1 + self.aux_weight * loss2
-
-"""
-class SegmentationLosses(Module):
-    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
-        super(SegmentationLosses, self).__init__()
-        self.aux = aux
-        self.aux_weight = aux_weight
-        # Somehow the size averge is not handled correctly on multi-gpu, so we average by ourself.
-        self.nll_loss = NLLLoss(weight, ignore_index=ignore_index, reduce=True)
-
-    def _forward_each(self, inputs, targets):
-        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
-
-    def forward(self, *inputs):
-        if not self.aux:
-            return self._forward_each(*inputs)
-        pred1, pred2, target = tuple(inputs)
-        loss1 = self._forward_each(pred1, target)
-        loss2 = self._forward_each(pred2, target)
-        return loss1 + self.aux_weight * loss2
-"""
+        elif not self.se_loss:
+            pred1, pred2, target = tuple(inputs)
+            loss1 = super(SegmentationLosses, self).forward(pred1, target)
+            loss2 = super(SegmentationLosses, self).forward(pred2, target)
+            return loss1 + self.aux_weight * loss2
+        elif not self.aux:
+            pred, se_pred, target = tuple(inputs)
+            se_target = self._get_batch_label_vector(target, nclass=self.nclass).type_as(pred)
+            loss1 = super(SegmentationLosses, self).forward(pred, target)
+            loss2 = self.bceloss(F.sigmoid(se_pred), se_target)
+            return loss1 + self.se_weight * loss2
+        else:
+            pred1, se_pred, pred2, target = tuple(inputs)
+            se_target = self._get_batch_label_vector(target, nclass=self.nclass).type_as(pred1)
+            loss1 = super(SegmentationLosses, self).forward(pred1, target)
+            loss2 = super(SegmentationLosses, self).forward(pred2, target)
+            loss3 = self.bceloss(F.sigmoid(se_pred), se_target)
+            return loss1 + self.aux_weight * loss2 + self.se_weight * loss3
+
+    @staticmethod
+    def _get_batch_label_vector(target, nclass):
+        # target: BxHxW label map -> BxnClass vector, 1 where a class in [0, nclass-1] occurs
+        nbatch = target.size(0)
+        labels = Variable(torch.zeros(nbatch, nclass))
+        for idx in range(nbatch):
+            counts = torch.histc(target[idx].cpu().data.float(),
+                                 bins=nclass, min=0,
+                                 max=nclass-1)
+            present = counts > 0
+            labels[idx] = present
+        return labels


 class View(Module):
@@ -135,45 +149,3 @@ class Normalize(Module):

    def forward(self, x):
        return F.normalize(x, self.p, self.dim, eps=1e-10)
-
-
-class PyramidPooling(Module):
-    """
-    Reference:
-        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
-    """
-    def __init__(self, in_channels):
-        super(PyramidPooling, self).__init__()
-        self.pool1 = AdaptiveAvgPool2d(1)
-        self.pool2 = AdaptiveAvgPool2d(2)
-        self.pool3 = AdaptiveAvgPool2d(3)
-        self.pool4 = AdaptiveAvgPool2d(6)
-
-        out_channels = int(in_channels/4)
-        self.conv1 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-        self.conv2 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-        self.conv3 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-        self.conv4 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-
-    def _cat_each(self, x, feat1, feat2, feat3, feat4):
-        assert(len(x) == len(feat1))
-        z = []
-        for i in range(len(x)):
-            z.append(torch.cat((x[i], feat1[i], feat2[i], feat3[i], feat4[i]), 1))
-        return z
-
-    def forward(self, x):
-        _, _, h, w = x.size()
-        feat1 = F.upsample(self.conv1(self.pool1(x)), (h, w), mode='bilinear')
-        feat2 = F.upsample(self.conv2(self.pool2(x)), (h, w), mode='bilinear')
-        feat3 = F.upsample(self.conv3(self.pool3(x)), (h, w), mode='bilinear')
-        feat4 = F.upsample(self.conv4(self.pool4(x)), (h, w), mode='bilinear')
-        return torch.cat((x, feat1, feat2, feat3, feat4), 1)
--- a/encoding/nn/encoding.py
+++ b/encoding/nn/encoding.py
@@ -15,9 +15,9 @@ import torch.nn.functional as F
 from torch.autograd import Variable
 from torch.nn.modules.utils import _pair

-from ..functions import scaledL2, aggregate, dilatedavgpool2d
+from ..functions import scaledL2, aggregate

-__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'DilatedAvgPool2d', 'UpsampleConv2d']
+__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'UpsampleConv2d']

 class Encoding(Module):
    r"""
@@ -203,82 +203,6 @@ class Inspiration(Module):
            + 'N x ' + str(self.C) + ')'


-class DilatedAvgPool2d(Module):
-    r"""We provide Dilated Average Pooling for the dilation of Densenet as
-    in :class:`encoding.dilated.DenseNet`.
-
-    Reference:
-
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
-        Amit Agrawal. “Context Encoding for Semantic Segmentation.
-        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
-
-    Applies a 2D average pooling over an input signal composed of several input planes.
-
-    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
-    output :math:`(B, C, H_{out}, W_{out})`, :attr:`kernel_size` :math:`(k_H,k_W)`,
-    :attr:`stride` :math:`(s_H,s_W)` :attr:`dilation` :math:`(d_H,d_W)`
-    can be precisely described as:
-
-    .. math::
-
-        \begin{array}{ll}
-        out(b, c, h, w)  = 1 / (k_H \cdot k_W) \cdot
-        \sum_{{m}=0}^{k_H-1} \sum_{{n}=0}^{k_W-1}
-        input(b, c, s_H \cdot h + d_H \cdot m, s_W \cdot w + d_W \cdot n)
-        \end{array}
-
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
-      for :attr:`padding` number of points
-
-    | The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`,
-      :attr:`dilation` can either be:
-
-        - a single ``int`` -- in which case the same value is used for the height
-          and width dimension
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for
-          the height dimension, and the second `int` for the width dimension
-
-    Args:
-        kernel_size: the size of the window
-        stride: the stride of the window. Default value is :attr:`kernel_size`
-        padding: implicit zero padding to be added on both sides
-        dilation: the dilation parameter similar to Conv2d
-
-    Shape:
-        - Input: :math:`(B, C, H_{in}, W_{in})`
-        - Output: :math:`(B, C, H_{out}, W_{out})` where
-          :math:`H_{out} = floor((H_{in}  + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)`
-          :math:`W_{out} = floor((W_{in}  + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)`
-          For :attr:`stride=1`, the output featuremap preserves the same size as input.
-
-    Examples::
-
-        >>> # pool of square window of size=3, stride=2, dilation=2
-        >>> m = nn.DilatedAvgPool2d(3, stride=2, dilation=2)
-        >>> input = autograd.Variable(torch.randn(20, 16, 50, 32))
-        >>> output = m(input)
-
-    """
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
-        super(DilatedAvgPool2d, self).__init__()
-        self.kernel_size = kernel_size
-        self.stride = stride or kernel_size
-        self.padding = padding
-        self.dilation = dilation
-
-    def forward(self, input):
-        return dilatedavgpool2d(input, self.kernel_size, self.stride,
-                                self.padding, self.dilation)
-
-    def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-            + 'size=' + str(self.kernel_size) \
-            + ', stride=' + str(self.stride) \
-            + ', padding=' + str(self.padding) \
-            + ', dilation=' + str(self.dilation) + ')'
-
-
 class UpsampleConv2d(Module):
    r"""
    To avoid the checkerboard artifacts of standard Fractionally-strided Convolution,

--- a/encoding/nn/syncbn.py
+++ b/encoding/nn/syncbn.py
@@ -23,34 +23,28 @@ from ..functions import *
 from ..parallel import allreduce
 from .comm import SyncMaster

+
 __all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
           'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
           'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']

-# Adapt from https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
-_ChildMessage = collections.namedtuple('Message', ['sum', 'ssum', 'sum_size'])
-_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])
-
 class _SyncBatchNorm(_BatchNorm):
-    def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)

        self._sync_master = SyncMaster(self._data_parallel_master)
-
-        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None

    def forward(self, input):
-        # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
-        if not (self._is_parallel and self.training):
+        if not self.training:
            return batch_norm(
                input, self.running_mean, self.running_var, self.weight, self.bias,
                self.training, self.momentum, self.eps)

        # Resize the input to (B, C, -1).
        input_shape = input.size()
-        input = input.view(input.size(0), self.num_features, -1)
+        input = input.view(input_shape[0], self.num_features, -1)

        # sum(x) and sum(x^2)
        N = input.size(0) * input.size(2)
@@ -62,11 +56,9 @@ class _SyncBatchNorm(_BatchNorm):
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(xsum, xsqsum, N))
        # forward
-        return batchnormtrain(input, self.weight, self.bias, mean, 1.0/inv_std).view(input_shape)
-
+        return batchnormtrain(input, mean, 1.0/inv_std, self.weight, self.bias).view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
-        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
@@ -110,7 +102,12 @@ class _SyncBatchNorm(_BatchNorm):
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

-        return mean, bias_var.clamp(self.eps) ** -0.5
+        return mean, (bias_var + self.eps) ** -0.5
+
+
+# API adapted from https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
+_ChildMessage = collections.namedtuple('Message', ['sum', 'ssum', 'sum_size'])
+_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])


 class BatchNorm1d(_SyncBatchNorm):
@@ -193,12 +190,11 @@ class BatchNorm3d(_SyncBatchNorm):

 class SharedTensor(object):
    """Shared Tensor for cross GPU all reduce operation"""
-    def __init__(self, nGPUs, op):
+    def __init__(self, nGPUs):
        self.mutex = threading.Lock()
        self.all_tasks_done = threading.Condition(self.mutex)
        self.nGPUs = nGPUs
        self._clear()
-        self.op = op

    def _clear(self):
        self.N = 0
@@ -206,9 +202,7 @@ class SharedTensor(object):
        self.push_tasks = self.nGPUs
        self.reduce_tasks = self.nGPUs

-    def __call__(self, *inputs):
-        if self.nGPUs <= 1:
-            return tuple(inputs)
+    def push(self, *inputs):
        # push from device
        with self.mutex:
            if self.push_tasks == 0:
@@ -223,13 +217,15 @@ class SharedTensor(object):
                self.all_tasks_done.notify_all()
            while self.push_tasks:
                self.all_tasks_done.wait()
+
+    def pull(self, igpu):
        # pull from device
        with self.mutex:
            if igpu == 0:
                assert(len(self.dict) == self.nGPUs)
                # flatten the tensors
                self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
-                self.outlist = self.op(2, *self.list)
+                self.outlist = allreduce(2, *self.list)
                self.reduce_tasks -= 1
            else:
                self.reduce_tasks -= 1

--- a/encoding/parallel.py
+++ b/encoding/parallel.py
@@ -189,10 +189,11 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices
        outputs.append(output)
    return outputs

+
 ###########################################################################
 # Adapted from Synchronized-BatchNorm-PyTorch.
 # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
-
+#
 class CallbackContext(object):
    pass


--- a/encoding/src/encoding_lib.cpp
+++ b/encoding/src/encoding_lib.cpp
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#include <THC/THC.h>
-#include <thc_encoding.h>
-
-extern THCState *state;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// float
-#include "generic/encoding_generic.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/syncbn_generic.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/pooling_generic.c"
-#include "THC/THCGenerateFloatType.h"
-
-// double
-#include "generic/encoding_generic.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/syncbn_generic.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/pooling_generic.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-}
-#endif
--- a/encoding/src/encoding_lib.h
+++ b/encoding/src/encoding_lib.h
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-
-/*
-#include <THC/THC.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
-#define THCTensor        TH_CONCAT_3(TH,CReal,Tensor)
-#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
-
-// float
-#include "generic/encoding_generic.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/syncbn_generic.h"
-#include "THC/THCGenerateFloatType.h"
-
-#ifdef __cplusplus
-}
-#endif
-*/
-
-int Encoding_Float_scaledl2_forward(THCudaTensor *SL,  
-    THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
-
-int Encoding_Float_scaledl2_backward(
-    THCudaTensor *GSL, THCudaTensor *GX, THCudaTensor *GC,
-    THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
-
-int Encoding_Float_aggregate_forward(THCudaTensor *E, THCudaTensor *A,
-			THCudaTensor *X, THCudaTensor *C);
-
-int Encoding_Float_aggregate_backward(THCudaTensor *GA, THCudaTensor *GE, 
-		THCudaTensor *A, THCudaTensor *X, THCudaTensor *C);
-
-int Encoding_Float_batchnorm_Forward(THCudaTensor *output_, 
-    THCudaTensor *input_, THCudaTensor *mean_, 
-    THCudaTensor *invstd_, THCudaTensor *gamma_, THCudaTensor *beta_);
-
-int Encoding_Float_batchnorm_Backward(THCudaTensor *gradoutput_, 
-    THCudaTensor *input_, THCudaTensor *gradinput_, 
-    THCudaTensor *gradgamma_, THCudaTensor *gradbeta_, 
-    THCudaTensor *mean_, THCudaTensor *invstd_, 
-    THCudaTensor *gamma_,THCudaTensor *beta_, 
-    THCudaTensor *gradMean_, THCudaTensor *gradStd_, int train);
-
-int Encoding_Float_sum_square_Forward(THCudaTensor *input_, 
-    THCudaTensor *sum_, THCudaTensor *square_);
-
-int Encoding_Float_sum_square_Backward(
-    THCudaTensor *gradInput, THCudaTensor *input_, 
-    THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
-
-int Encoding_Float_DilatedAvgPool2d_Forward(
-    THCudaTensor *X_, THCudaTensor *Y_, 
-    int kH, int kW, int dH, int dW,
-    int padH, int padW,
-    int dilationH, int dilationW);
-
-int Encoding_Float_DilatedAvgPool2d_Backward(
-    THCudaTensor *gradX_, THCudaTensor *gradY_, 
-    int kH, int kW, int dH, int dW,
-    int padH, int padW,
-    int dilationH, int dilationW);
-
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-
-int Encoding_Double_scaledl2_forward(THCudaDoubleTensor *SL,  
-    THCudaDoubleTensor *X, THCudaDoubleTensor *C,  THCudaDoubleTensor *S);
-
-int Encoding_Double_scaledl2_backward(
-    THCudaDoubleTensor *GSL, THCudaDoubleTensor *GX, 
-    THCudaDoubleTensor *GC, THCudaDoubleTensor *X, 
-    THCudaDoubleTensor *C, THCudaDoubleTensor *S);
-
-int Encoding_Double_aggregate_forward(THCudaDoubleTensor *E, 
-    THCudaDoubleTensor *A, THCudaDoubleTensor *X, THCudaDoubleTensor *C);
-
-int Encoding_Double_aggregate_backward(THCudaDoubleTensor *GA, 
-    THCudaDoubleTensor *GE, THCudaDoubleTensor *A, THCudaDoubleTensor *X, 
-    THCudaDoubleTensor *C);
-
-int Encoding_Double_batchnorm_Forward(THCudaDoubleTensor *output_, 
-    THCudaDoubleTensor *input_, THCudaDoubleTensor *mean_, 
-    THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_, 
-    THCudaDoubleTensor *beta_);
-
-int Encoding_Double_batchnorm_Backward(THCudaDoubleTensor *gradoutput_, 
-    THCudaDoubleTensor *input_, THCudaDoubleTensor *gradinput_, 
-    THCudaDoubleTensor *gradgamma_, THCudaDoubleTensor *gradbeta_, 
-    THCudaDoubleTensor *mean_, THCudaDoubleTensor *invstd_, 
-    THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_, 
-    THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_, 
-    int train);
-
-int Encoding_Double_sum_square_Forward(THCudaDoubleTensor *input_, 
-    THCudaDoubleTensor *sum_, THCudaDoubleTensor *square_);
-
-void Encoding_Double_sum_square_Backward(
-    THCudaDoubleTensor *gradInput, THCudaDoubleTensor *input_, 
-    THCudaDoubleTensor *gradSum_, THCudaDoubleTensor *gradSquare_);
-
-int Encoding_Double_DilatedAvgPool2d_Forward(
-    THCudaDoubleTensor *X_, THCudaDoubleTensor *Y_, 
-    int kH, int kW, int dH, int dW,
-    int padH, int padW,
-    int dilationH, int dilationW);
-
-int Encoding_Double_DilatedAvgPool2d_Backward(
-    THCudaDoubleTensor *gradX_, THCudaDoubleTensor *gradY_, 
-    int kH, int kW, int dH, int dW,
-    int padH, int padW,
-    int dilationH, int dilationW);
-
--- a/encoding/src/generic/encoding_generic.c
+++ b/encoding/src/generic/encoding_generic.c
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/encoding_generic.c"
-#else
-
-int Encoding_(scaledl2_forward)(THCTensor *SL,  
-    THCTensor *X, THCTensor *C,  THCTensor *S)
-/*
- * ScaledL2 operation
- */
-{
-		Encoding_(ScaledL2_Forward)(state, SL, X, C, S);
-		/* C function return number of the outputs */
-		return 0;
-}
-
-int Encoding_(scaledl2_backward)(
-    THCTensor *GSL, THCTensor *GX, THCTensor *GC,
-    THCTensor *X, THCTensor *C, THCTensor *S)
-/*
- * ScaledL2 operation
- */
-{
-		Encoding_(ScaledL2_Backward)(state, GSL, GX, GC, X, C, S);
-		/* C function return number of the outputs */
-		return 0;
-}
-
-int Encoding_(aggregate_forward)(THCTensor *E, THCTensor *A,
-			THCTensor *X, THCTensor *C)
-/*
- * Aggregate operation
- */
-{
-		Encoding_(Aggregate_Forward)(state, E, A, X, C);
-		/* C function return number of the outputs */
-		return 0;
-}
-
-int Encoding_(aggregate_backward)(THCTensor *GA, THCTensor *GE, 
-		THCTensor *A, THCTensor *X, THCTensor *C)
-/*
- * Aggregate backward operation to A
- * G (dl/dR), L (dl/dE), A (assignments)
- */
-{
-		Encoding_(Aggregate_Backward)(state, GA, GE, A, X, C);
-		/* C function return number of the outputs */
-		return 0;
-}
-
-#endif
--- a/encoding/src/generic/encoding_generic.h
+++ b/encoding/src/generic/encoding_generic.h
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/encoding_generic.h"
-#else
-
-int Encoding_(scaledl2_forward)(THCTensor *SL,  
-    THCTensor *X, THCTensor *C,  THCTensor *S);
-
-int Encoding_(scaledl2_backward)(
-    THCTensor *GSL, THCTensor *GX, THCTensor *GC,
-    THCTensor *X, THCTensor *C, THCTensor *S);
-
-int Encoding_(aggregate_forward)(THCTensor *E, THCTensor *A,
-			THCTensor *X, THCTensor *C);
-
-int Encoding_(aggregate_backward)(THCTensor *GA, THCTensor *GE, 
-		THCTensor *A, THCTensor *X, THCTensor *C);
-
-int Encoding_(aggregateP_forward)(THCTensor *E, THCTensor *A,
-			THCTensor *R);
-
-int Encoding_(aggregateP_backward)(THCTensor *GA, THCTensor *GR, 
-		THCTensor *L, THCTensor *A, THCTensor *R);
-
-int Encoding_(residual_forward)(THCTensor *R, THCTensor *X, THCTensor *D);
-
-int Encoding_(residual_backward)(THCTensor *GR, THCTensor *GX, 
-    THCTensor *GD);
-
-int Encoding_(squaresqueeze_forward)(THCTensor *L, THCTensor *R);
-
-int Encoding_(squaresqueeze_backward)(THCTensor *GL, THCTensor *GR, 
-    THCTensor *R);
-
-#endif