Unverified commit 07f25381, authored by Hang Zhang, committed by GitHub
Parents: cebf1341 70fdeb79
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_generic.c"
#else
int Encoding_(DilatedAvgPool2d_Forward)(
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
 * Forward wrapper: dispatches to the THC dilated average pooling kernel.
 */
{
Encoding_(DilatedAvgPool_Forward)(state,
X_, Y_, kH, kW, dH, dW,
padH, padW, dilationH, dilationW);
/* cffi convention: the C function returns the number of outputs */
return 0;
}
int Encoding_(DilatedAvgPool2d_Backward)(
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
 * Backward wrapper: computes the input gradient of dilated average pooling.
 */
{
Encoding_(DilatedAvgPool_Backward)(state,
gradX_, gradY_, kH, kW, dH, dW,
padH, padW, dilationH, dilationW);
/* cffi convention: the C function returns the number of outputs */
return 0;
}
#endif
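For reference, the output spatial size of the dilated pooling wrapped above follows standard convolution arithmetic. A minimal illustrative sketch in Python (this helper is not part of the library):

import math

def dilated_pool_out_size(in_size, k, stride, pad, dilation):
    # An effective kernel spans dilation * (k - 1) + 1 input positions.
    effective_k = dilation * (k - 1) + 1
    return math.floor((in_size + 2 * pad - effective_k) / stride) + 1

# e.g. a 32-wide map with k=3, stride=1, pad=2, dilation=2 stays 32 wide
assert dilated_pool_out_size(32, k=3, stride=1, pad=2, dilation=2) == 32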
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_generic.h"
#else
int Encoding_(DilatedAvgPool2d_Forward)(
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
int Encoding_(DilatedAvgPool2d_Backward)(
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_generic.c"
#else
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
 * Forward wrapper for synchronized batch normalization.
 */
{
Encoding_(BatchNorm_Forward)(state, output_, input_,
mean_, invstd_, gamma_, beta_);
/* cffi convention: the C function returns the number of outputs */
return 0;
}
int Encoding_(batchnorm_Backward)(THCTensor *gradoutput_,
THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
 * Backward wrapper: gradients w.r.t. input, gamma, beta, and the statistics.
 */
{
Encoding_(BatchNorm_Backward)(state, gradoutput_, input_, gradinput_,
gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_, gradMean_, gradStd_,
train);
/* cffi convention: the C function returns the number of outputs */
return 0;
}
int Encoding_(sum_square_Forward)(THCTensor *input_,
THCTensor *sum_, THCTensor *square_)
/*
 * Per-channel sum and sum of squares, used to build the global statistics.
 */
{
Encoding_(Sum_Square_Forward)(state, input_, sum_, square_);
/* cffi convention: the C function returns the number of outputs */
return 0;
}
int Encoding_(sum_square_Backward)(
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * Backward of the per-channel sum/square reduction.
 */
{
Encoding_(Sum_Square_Backward)(state, gradInput, input_, gradSum_,
gradSquare_);
/* cffi convention: the C function returns the number of outputs */
return 0;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_generic.h"
#else
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
int Encoding_(batchnorm_Backward)(THCTensor *gradoutput_,
THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
int Encoding_(sum_square_Forward)(THCTensor *input_,
THCTensor *sum_, THCTensor *square_);
int Encoding_(sum_square_Backward)(
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
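The Sum_Square/BatchNorm split declared above reflects the usual synchronized-BN decomposition: each device contributes per-channel sums, the mean and inverse standard deviation come from the pooled statistics, and the affine transform is applied elementwise. A NumPy sketch of that math, illustrative only (the function name is not the library's API):

import numpy as np

def syncbn_forward(x, gamma, beta, eps=1e-5):
    # x: (N, C, H, W); statistics are reduced over N, H, W per channel --
    # the same quantities Sum_Square_Forward and BatchNorm_Forward compute.
    n = x.shape[0] * x.shape[2] * x.shape[3]
    xsum = x.sum(axis=(0, 2, 3))           # per-channel sum
    xsqsum = (x ** 2).sum(axis=(0, 2, 3))  # per-channel sum of squares
    mean = xsum / n
    invstd = 1.0 / np.sqrt(xsqsum / n - mean ** 2 + eps)
    s = (1, -1, 1, 1)                      # broadcast over (N, H, W)
    return (gamma.reshape(s) * (x - mean.reshape(s)) * invstd.reshape(s)
            + beta.reshape(s))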
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Util Tools"""
from .lr_scheduler import LR_Scheduler
from .metrics import batch_intersection_union, batch_pix_accuracy
from .pallete import get_mask_pallete
from .train_helper import get_selabel_vector
from .presets import load_image
from .files import *
__all__ = ['LR_Scheduler', 'batch_pix_accuracy', 'batch_intersection_union',
'save_checkpoint', 'download', 'mkdir', 'check_sha1', 'load_image',
'get_mask_pallete']
import os
import requests
import errno
import shutil
import hashlib
from tqdm import tqdm
import torch
__all__ = ['save_checkpoint', 'download', 'mkdir', 'check_sha1']
def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
"""Saves checkpoint to disk"""
directory = "runs/%s/%s/%s/"%(args.dataset, args.model, args.checkname)
if not os.path.exists(directory):
os.makedirs(directory)
filename = directory + filename
torch.save(state, filename)
if is_best:
shutil.copyfile(filename, directory + 'model_best.pth.tar')
def download(url, path=None, overwrite=False, sha1_hash=None):
"""Download an given URL
Parameters
----------
url : str
URL to download
path : str, optional
Destination path to store downloaded file. By default stores to the
current directory with same name as in url.
overwrite : bool, optional
Whether to overwrite destination file if already exists.
sha1_hash : str, optional
Expected sha1 hash in hexadecimal digits. An existing file is ignored and
re-downloaded when the specified hash does not match.
Returns
-------
str
The file path of the downloaded file.
"""
if path is None:
fname = url.split('/')[-1]
else:
path = os.path.expanduser(path)
if os.path.isdir(path):
fname = os.path.join(path, url.split('/')[-1])
else:
fname = path
if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
if not os.path.exists(dirname):
os.makedirs(dirname)
print('Downloading %s from %s...'%(fname, url))
r = requests.get(url, stream=True)
if r.status_code != 200:
raise RuntimeError("Failed downloading url %s"%url)
total_length = r.headers.get('content-length')
with open(fname, 'wb') as f:
if total_length is None: # no content length header
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
else:
total_length = int(total_length)
for chunk in tqdm(r.iter_content(chunk_size=1024),
total=int(total_length / 1024. + 0.5),
unit='KB', unit_scale=False, dynamic_ncols=True):
f.write(chunk)
if sha1_hash and not check_sha1(fname, sha1_hash):
raise UserWarning('File {} is downloaded but the content hash does not match. ' \
'The repo may be outdated or download may be incomplete. ' \
'If the "repo_url" is overridden, consider switching to ' \
'the default repo.'.format(fname))
return fname
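A typical call looks like the following; the URL and SHA-1 below are placeholders, not real artifacts:

# Hypothetical usage of download(); URL and hash are placeholders.
fname = download('https://example.com/weights.pth',
                 path='~/.encoding/models',
                 sha1_hash='da39a3ee5e6b4b0d3255bfef95601890afd80709')
print('saved to', fname)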
def check_sha1(filename, sha1_hash):
"""Check whether the sha1 hash of the file content matches the expected hash.
Parameters
----------
filename : str
Path to the file.
sha1_hash : str
Expected sha1 hash in hexadecimal digits.
Returns
-------
bool
Whether the file content matches the expected hash.
"""
sha1 = hashlib.sha1()
with open(filename, 'rb') as f:
while True:
data = f.read(1048576)
if not data:
break
sha1.update(data)
return sha1.hexdigest() == sha1_hash
def mkdir(path):
"""make dir exists okay"""
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise
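On Python 3.2+ the same EEXIST-tolerant behavior is built in; an equivalent sketch:

import os

def mkdir_py3(path):
    # exist_ok=True makes makedirs succeed silently if the path exists
    os.makedirs(path, exist_ok=True)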
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
class LR_Scheduler(object):
"""Learning Rate Scheduler
Step mode: ``lr = baselr * 0.1 ^ floor((epoch-1) / lr_step)``
Cosine mode: ``lr = baselr * 0.5 * (1 + cos(pi * iter / maxiter))``
Poly mode: ``lr = baselr * (1 - iter/maxiter) ^ 0.9``
Args:
args: :attr:`args.lr_scheduler` lr scheduler mode (`cos`, `poly`, `step`),
:attr:`args.lr` base learning rate, :attr:`args.epochs` number of epochs,
:attr:`args.lr_step` step size for step mode
niters: number of iterations per epoch
"""
def __init__(self, args, niters=0):
self.mode = args.lr_scheduler
print('Using {} LR Scheduler!'.format(self.mode))
self.lr = args.lr
if self.mode == 'step':
self.lr_step = args.lr_step
else:
self.niters = niters
self.N = args.epochs * niters
self.epoch = -1
def __call__(self, optimizer, i, epoch, best_pred):
if self.mode == 'cos':
T = (epoch - 1) * self.niters + i
lr = 0.5 * self.lr * (1 + math.cos(1.0 * T / self.N * math.pi))
elif self.mode == 'poly':
T = (epoch - 1) * self.niters + i
lr = self.lr * pow((1 - 1.0 * T / self.N), 0.9)
elif self.mode == 'step':
lr = self.lr * (0.1 ** ((epoch - 1) // self.lr_step))
else:
raise RuntimeError('Unknown LR scheduler!')
if epoch > self.epoch:
print('\n=> Epoch %i, learning rate = %.4f, \
previous best = %.4f' % (epoch, lr, best_pred))
self.epoch = epoch
self._adjust_learning_rate(optimizer, lr)
def _adjust_learning_rate(self, optimizer, lr):
if len(optimizer.param_groups) == 1:
optimizer.param_groups[0]['lr'] = lr
else:
# enlarge the lr at the head
optimizer.param_groups[0]['lr'] = lr
for i in range(1, len(optimizer.param_groups)):
optimizer.param_groups[i]['lr'] = lr * 10
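A sketch of how the scheduler is typically driven inside a training loop; everything except LR_Scheduler here (args, model, criterion, train_loader, best_pred) is assumed context, not library code:

import torch

scheduler = LR_Scheduler(args, niters=len(train_loader))
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
for epoch in range(1, args.epochs + 1):
    for i, (image, target) in enumerate(train_loader):
        scheduler(optimizer, i, epoch, best_pred)  # set lr for this iteration
        optimizer.zero_grad()
        loss = criterion(model(image), target)
        loss.backward()
        optimizer.step()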
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import numpy as np
import torch
def batch_pix_accuracy(predict, target):
"""Batch Pixel Accuracy
Args:
predict: input 4D tensor
target: label 3D tensor
"""
_, predict = torch.max(predict, 1)
predict = predict.cpu().numpy() + 1
target = target.cpu().numpy() + 1
pixel_labeled = np.sum(target > 0)
pixel_correct = np.sum((predict == target)*(target > 0))
assert pixel_correct <= pixel_labeled, "Correct pixel count should not exceed labeled pixel count"
return pixel_correct, pixel_labeled
def batch_intersection_union(predict, target, nclass):
"""Batch Intersection of Union
Args:
predict: input 4D tensor
target: label 3D tensor
nclass: number of categories (int)
"""
_, predict = torch.max(predict, 1)
mini = 1
maxi = nclass
nbins = nclass
predict = predict.cpu().numpy() + 1
target = target.cpu().numpy() + 1
predict = predict * (target > 0).astype(predict.dtype)
intersection = predict * (predict == target)
# areas of intersection and union
area_inter, _ = np.histogram(intersection, bins=nbins, range=(mini, maxi))
area_pred, _ = np.histogram(predict, bins=nbins, range=(mini, maxi))
area_lab, _ = np.histogram(target, bins=nbins, range=(mini, maxi))
area_union = area_pred + area_lab - area_inter
assert (area_inter <= area_union).all(), \
"Intersection area should not exceed union area"
return area_inter, area_union
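The returned counts are usually accumulated over batches and reduced to pixel accuracy and mean IoU; a short sketch, where output (B x nclass x H x W) and target (B x H x W) are assumed tensors and np.spacing(1) guards against division by zero:

import numpy as np

correct, labeled = batch_pix_accuracy(output, target)
inter, union = batch_intersection_union(output, target, nclass)
pix_acc = 1.0 * correct / (np.spacing(1) + labeled)
iou = 1.0 * inter / (np.spacing(1) + union)
print('pixAcc: %.4f, mIoU: %.4f' % (pix_acc, iou.mean()))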
# ref https://github.com/CSAILVision/sceneparsing/blob/master/evaluationCode/utils_eval.py
def pixel_accuracy(im_pred, im_lab):
im_pred = np.asarray(im_pred)
im_lab = np.asarray(im_lab)
# Remove classes from unlabeled pixels in gt image.
# We should not penalize detections in unlabeled portions of the image.
pixel_labeled = np.sum(im_lab > 0)
pixel_correct = np.sum((im_pred == im_lab) * (im_lab > 0))
#pixel_accuracy = 1.0 * pixel_correct / pixel_labeled
return pixel_correct, pixel_labeled
def intersection_and_union(im_pred, im_lab, num_class):
im_pred = np.asarray(im_pred)
im_lab = np.asarray(im_lab)
# Remove classes from unlabeled pixels in gt image.
im_pred = im_pred * (im_lab > 0)
# Compute area intersection:
intersection = im_pred * (im_pred == im_lab)
area_inter, _ = np.histogram(intersection, bins=num_class-1,
range=(1, num_class - 1))
# Compute area union:
area_pred, _ = np.histogram(im_pred, bins=num_class-1,
range=(1, num_class - 1))
area_lab, _ = np.histogram(im_lab, bins=num_class-1,
range=(1, num_class - 1))
area_union = area_pred + area_lab - area_inter
return area_inter, area_union
@@ -8,162 +8,7 @@
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Util Tools"""
import os
import errno
import requests
import shutil
import hashlib
import math
from tqdm import tqdm
import numpy as np
import torch
__all__ = ['LR_Scheduler', 'save_checkpoint', 'batch_pix_accuracy',
'batch_intersection_union', 'download', 'mkdir', 'check_sha1']
class LR_Scheduler(object):
"""Learning Rate Scheduler
Step mode: ``lr = baselr * 0.1 ^ floor((epoch-1) / lr_step)``
Cosine mode: ``lr = baselr * 0.5 * (1 + cos(pi * iter / maxiter))``
Poly mode: ``lr = baselr * (1 - iter/maxiter) ^ 0.9``
Args:
args: :attr:`args.lr_scheduler` lr scheduler mode (`cos`, `poly`, `step`),
:attr:`args.lr` base learning rate, :attr:`args.epochs` number of epochs,
:attr:`args.lr_step` step size for step mode
niters: number of iterations per epoch
"""
def __init__(self, args, niters=0):
self.mode = args.lr_scheduler
print('Using {} LR Scheduler!'.format(self.mode))
self.lr = args.lr
if self.mode == 'step':
self.lr_step = args.lr_step
else:
self.niters = niters
self.N = args.epochs * niters
self.epoch = -1
def __call__(self, optimizer, i, epoch, best_pred):
if self.mode == 'cos':
T = (epoch - 1) * self.niters + i
lr = 0.5 * self.lr * (1 + math.cos(1.0 * T / self.N * math.pi))
elif self.mode == 'poly':
T = (epoch - 1) * self.niters + i
lr = self.lr * pow((1 - 1.0 * T / self.N), 0.9)
elif self.mode == 'step':
lr = self.lr * (0.1 ** ((epoch - 1) // self.lr_step))
else:
raise RuntimeError('Unknown LR scheduler!')
if epoch > self.epoch:
print('\n=> Epoch %i, learning rate = %.4f, \
previous best = %.4f' % (epoch, lr, best_pred))
self.epoch = epoch
self._adjust_learning_rate(optimizer, lr)
def _adjust_learning_rate(self, optimizer, lr):
if len(optimizer.param_groups) == 1:
optimizer.param_groups[0]['lr'] = lr
else:
# enlarge the lr at the head
optimizer.param_groups[0]['lr'] = lr
for i in range(1, len(optimizer.param_groups)):
optimizer.param_groups[i]['lr'] = lr * 10
# refer to https://github.com/xternalz/WideResNet-pytorch
def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
"""Saves checkpoint to disk"""
directory = "runs/%s/%s/%s/"%(args.dataset, args.model, args.checkname)
if not os.path.exists(directory):
os.makedirs(directory)
filename = directory + filename
torch.save(state, filename)
if is_best:
shutil.copyfile(filename, directory + 'model_best.pth.tar')
def batch_pix_accuracy(predict, target):
"""Batch Pixel Accuracy
Args:
predict: input 4D tensor
target: label 3D tensor
"""
_, predict = torch.max(predict, 1)
# pixel_labeled = (target >= 0).sum().item()
# TODO currently torch.eq is not working as expected, change back when it's fixed
# pixel_correct = torch.eq(predict, target).sum().item()
predict = predict.cpu().numpy()
target = target.cpu().numpy()
pixel_labeled = np.sum(target >= 0)
pixel_correct = np.sum((predict == target)*(target >= 0))
assert(pixel_correct <= pixel_labeled)
return pixel_correct, pixel_labeled
def batch_intersection_union(predict, target, nclass):
"""Batch Intersection of Union
Args:
predict: input 4D tensor
target: label 3D tensor
nclass: number of categories (int)
"""
_, predict = torch.max(predict, 1)
mini = 0
maxi = nclass - 1
nbins = nclass
"""
predict = predict.cpu().numpy()
target = target.cpu().numpy()
predict = predict * (target >= 0).astype(predict.dtype)
intersection = predict * (predict == target)
# areas of intersection and union
area_inter, _ = np.histogram(intersection, bins=nbins,
range=(mini, maxi))
area_pred, _ = np.histogram(predict, bins=nbins,
range=(mini, maxi))
area_lab, _ = np.histogram(target, bins=nbins,
range=(mini, maxi))
area_union = area_pred + area_lab - area_inter
# A PyTorch update somehow broke this; will change back once fixed
"""
predict = predict * (target >= 0).type_as(predict)
intersection = predict * (predict == target).type_as(predict)
area_inter = torch.histc(intersection.cpu().float(), bins=nclass,
min=mini, max=maxi)
area_pred = torch.histc(predict.cpu().float(), bins=nclass, min=mini,
max=maxi)
area_lab = torch.histc(target.cpu().float(), bins=nclass, min=mini,
max=maxi)
area_union = area_pred + area_lab - area_inter
return area_inter, area_union
def get_selabel_vector(target, nclass):
"""Get SE-Loss Label in a batch
Args:
target: label 3D tensor (BxHxW)
nclass: number of categories (int)
Output:
2D tensor (BxnClass)
"""
batch = target.size(0)
tvect = torch.zeros(batch, nclass)
for i in range(batch):
hist = torch.histc(target[i].data.float(),
bins=nclass, min=0,
max=nclass-1)
vect = hist>0
tvect[i] = vect
return tvect
from PIL import Image
def get_mask_pallete(npimg, dataset='detail'):
"""Get image color pallete for visualizing masks"""
@@ -171,7 +16,7 @@ def get_mask_pallete(npimg, dataset='detail'):
if dataset == 'pascal_voc':
npimg[npimg==21] = 255
# put colormap
out_img = Image.fromarray(npimg.astype('uint8'))
out_img = Image.fromarray(npimg.squeeze().astype('uint8'))
if dataset == 'ade20k':
out_img.putpalette(adepallete)
elif dataset == 'cityscapes':
@@ -181,131 +26,6 @@ def get_mask_pallete(npimg, dataset='detail'):
return out_img
def download(url, path=None, overwrite=False, sha1_hash=None):
"""Download an given URL
Parameters
----------
url : str
URL to download
path : str, optional
Destination path to store downloaded file. By default stores to the
current directory with same name as in url.
overwrite : bool, optional
Whether to overwrite destination file if already exists.
sha1_hash : str, optional
Expected sha1 hash in hexadecimal digits. An existing file is ignored and
re-downloaded when the specified hash does not match.
Returns
-------
str
The file path of the downloaded file.
"""
if path is None:
fname = url.split('/')[-1]
else:
path = os.path.expanduser(path)
if os.path.isdir(path):
fname = os.path.join(path, url.split('/')[-1])
else:
fname = path
if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
if not os.path.exists(dirname):
os.makedirs(dirname)
print('Downloading %s from %s...'%(fname, url))
r = requests.get(url, stream=True)
if r.status_code != 200:
raise RuntimeError("Failed downloading url %s"%url)
total_length = r.headers.get('content-length')
with open(fname, 'wb') as f:
if total_length is None: # no content length header
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
else:
total_length = int(total_length)
for chunk in tqdm(r.iter_content(chunk_size=1024),
total=int(total_length / 1024. + 0.5),
unit='KB', unit_scale=False, dynamic_ncols=True):
f.write(chunk)
if sha1_hash and not check_sha1(fname, sha1_hash):
raise UserWarning('File {} is downloaded but the content hash does not match. ' \
'The repo may be outdated or download may be incomplete. ' \
'If the "repo_url" is overridden, consider switching to ' \
'the default repo.'.format(fname))
return fname
def check_sha1(filename, sha1_hash):
"""Check whether the sha1 hash of the file content matches the expected hash.
Parameters
----------
filename : str
Path to the file.
sha1_hash : str
Expected sha1 hash in hexadecimal digits.
Returns
-------
bool
Whether the file content matches the expected hash.
"""
sha1 = hashlib.sha1()
with open(filename, 'rb') as f:
while True:
data = f.read(1048576)
if not data:
break
sha1.update(data)
return sha1.hexdigest() == sha1_hash
def mkdir(path):
"""make dir exists okay"""
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise
# ref https://github.com/CSAILVision/sceneparsing/blob/master/evaluationCode/utils_eval.py
def pixel_accuracy(im_pred, im_lab):
im_pred = np.asarray(im_pred)
im_lab = np.asarray(im_lab)
# Remove classes from unlabeled pixels in gt image.
# We should not penalize detections in unlabeled portions of the image.
pixel_labeled = np.sum(im_lab > 0)
pixel_correct = np.sum((im_pred == im_lab) * (im_lab > 0))
#pixel_accuracy = 1.0 * pixel_correct / pixel_labeled
return pixel_correct, pixel_labeled
def intersection_and_union(im_pred, im_lab, num_class):
im_pred = np.asarray(im_pred)
im_lab = np.asarray(im_lab)
# Remove classes from unlabeled pixels in gt image.
im_pred = im_pred * (im_lab > 0)
# Compute area intersection:
intersection = im_pred * (im_pred == im_lab)
area_inter, _ = np.histogram(intersection, bins=num_class-1,
range=(1, num_class - 1))
# Compute area union:
area_pred, _ = np.histogram(im_pred, bins=num_class-1,
range=(1, num_class - 1))
area_lab, _ = np.histogram(im_lab, bins=num_class-1,
range=(1, num_class - 1))
area_union = area_pred + area_lab - area_inter
return area_inter, area_union
def _get_voc_pallete(num_cls):
n = num_cls
pallete = [0]*(n*3)
@@ -329,3 +49,4 @@ adepallete = [0,0,0,120,120,120,180,120,120,6,230,230,80,50,50,4,200,3,120,120,8
citypallete = [
128,64,128,244,35,232,70,70,70,102,102,156,190,153,153,153,153,153,250,170,30,220,220,0,107,142,35,152,251,152,70,130,180,220,20,60,255,0,0,0,0,142,0,0,70,0,60,100,0,80,100,0,0,230,119,11,32,128,192,0,0,64,128,128,64,128,0,192,128,128,192,128,64,64,0,192,64,0,64,192,0,192,192,0,64,64,128,192,64,128,64,192,128,192,192,128,0,0,64,128,0,64,0,128,64,128,128,64,0,0,192,128,0,192,0,128,192,128,128,192,64,0,64,192,0,64,64,128,64,192,128,64,64,0,192,192,0,192,64,128,192,192,128,192,0,64,64,128,64,64,0,192,64,128,192,64,0,64,192,128,64,192,0,192,192,128,192,192,64,64,64,192,64,64,64,192,64,192,192,64,64,64,192,192,64,192,64,192,192,192,192,192,32,0,0,160,0,0,32,128,0,160,128,0,32,0,128,160,0,128,32,128,128,160,128,128,96,0,0,224,0,0,96,128,0,224,128,0,96,0,128,224,0,128,96,128,128,224,128,128,32,64,0,160,64,0,32,192,0,160,192,0,32,64,128,160,64,128,32,192,128,160,192,128,96,64,0,224,64,0,96,192,0,224,192,0,96,64,128,224,64,128,96,192,128,224,192,128,32,0,64,160,0,64,32,128,64,160,128,64,32,0,192,160,0,192,32,128,192,160,128,192,96,0,64,224,0,64,96,128,64,224,128,64,96,0,192,224,0,192,96,128,192,224,128,192,32,64,64,160,64,64,32,192,64,160,192,64,32,64,192,160,64,192,32,192,192,160,192,192,96,64,64,224,64,64,96,192,64,224,192,64,96,64,192,224,64,192,96,192,192,224,192,192,0,32,0,128,32,0,0,160,0,128,160,0,0,32,128,128,32,128,0,160,128,128,160,128,64,32,0,192,32,0,64,160,0,192,160,0,64,32,128,192,32,128,64,160,128,192,160,128,0,96,0,128,96,0,0,224,0,128,224,0,0,96,128,128,96,128,0,224,128,128,224,128,64,96,0,192,96,0,64,224,0,192,224,0,64,96,128,192,96,128,64,224,128,192,224,128,0,32,64,128,32,64,0,160,64,128,160,64,0,32,192,128,32,192,0,160,192,128,160,192,64,32,64,192,32,64,64,160,64,192,160,64,64,32,192,192,32,192,64,160,192,192,160,192,0,96,64,128,96,64,0,224,64,128,224,64,0,96,192,128,96,192,0,224,192,128,224,192,64,96,64,192,96,64,64,224,64,192,224,64,64,96,192,192,96,192,64,224,192,192,224,192,32,32,0,160,32,0,32,160,0,160,160,0,32,32,128,160,32,128,32,160,128,160,160,128,96,32,0,224,32,0,96,160,0,224,160,0,96,32,128,224,32,128,96,160,128,224,160,128,32,96,0,160,96,0,32,224,0,160,224,0,32,96,128,160,96,128,32,224,128,160,224,128,96,96,0,224,96,0,96,224,0,224,224,0,96,96,128,224,96,128,96,224,128,224,224,128,32,32,64,160,32,64,32,160,64,160,160,64,32,32,192,160,32,192,32,160,192,160,160,192,96,32,64,224,32,64,96,160,64,224,160,64,96,32,192,224,32,192,96,160,192,224,160,192,32,96,64,160,96,64,32,224,64,160,224,64,32,96,192,160,96,192,32,224,192,160,224,192,96,96,64,224,96,64,96,224,64,224,224,64,96,96,192,224,96,192,96,224,192,0,0,0]
"""Preset Transforms for Demos"""
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transform
__all__ = ['load_image', 'subtract_imagenet_mean_batch']
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize([.485, .456, .406], [.229, .224, .225])])
def load_image(filename, size=None, scale=None, keep_asp=True):
"""Load the image for demos"""
img = Image.open(filename).convert('RGB')
if size is not None:
if keep_asp:
size2 = int(size * 1.0 / img.size[0] * img.size[1])
img = img.resize((size, size2), Image.ANTIALIAS)
else:
img = img.resize((size, size), Image.ANTIALIAS)
elif scale is not None:
img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.ANTIALIAS)
img = input_transform(img)
return img
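Typical demo usage; 'demo.jpg' is a placeholder path:

img = load_image('demo.jpg', size=480)  # CHW tensor, ImageNet-normalized
batch = img.unsqueeze(0)                # add batch dimension -> 1 x C x H x W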
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
def get_selabel_vector(target, nclass):
"""Get SE-Loss Label in a batch
Args:
target: label 3D tensor (BxHxW)
nclass: number of categories (int)
Output:
2D tensor (BxnClass)
"""
batch = target.size(0)
tvect = torch.zeros(batch, nclass)
for i in range(batch):
hist = torch.histc(target[i].data.float(),
bins=nclass, min=0,
max=nclass-1)
vect = hist>0
tvect[i] = vect
return tvect
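The B x nClass indicator returned above usually serves as the target of a per-image multi-label loss. A hedged sketch, where se_pred (a B x nclass class-presence logit tensor from an SE head of the network) is hypothetical:

import torch.nn.functional as F

# `se_pred` is assumed context; `target` is the (B, H, W) label tensor.
se_target = get_selabel_vector(target, nclass).type_as(se_pred)
se_loss = F.binary_cross_entropy_with_logits(se_pred, se_target)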
"""Prepare ADE20K dataset"""
import os
import shutil
import argparse
import zipfile
from encoding.utils import download, mkdir
_TARGET_DIR = os.path.expanduser('~/.encoding/data')
def parse_args():
parser = argparse.ArgumentParser(
description='Initialize ADE20K dataset.',
epilog='Example: python prepare_ade20k.py',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--download-dir', default=None, help='dataset directory on disk')
args = parser.parse_args()
return args
def download_ade(path, overwrite=False):
_AUG_DOWNLOAD_URLS = [
('http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip', '219e1696abb36c8ba3a3afe7fb2f4b4606a897c7'),
('http://data.csail.mit.edu/places/ADEchallenge/release_test.zip', 'e05747892219d10e9243933371a497e905a4860c'),]
download_dir = os.path.join(path, 'downloads')
mkdir(download_dir)
for url, checksum in _AUG_DOWNLOAD_URLS:
filename = download(url, path=download_dir, overwrite=overwrite, sha1_hash=checksum)
# extract
with zipfile.ZipFile(filename,"r") as zip_ref:
zip_ref.extractall(path=path)
if __name__ == '__main__':
args = parse_args()
mkdir(os.path.expanduser('~/.encoding/data'))
if args.download_dir is not None:
if os.path.isdir(_TARGET_DIR):
os.remove(_TARGET_DIR)
# make symlink
os.symlink(args.download_dir, _TARGET_DIR)
else:
download_ade(_TARGET_DIR, overwrite=False)
"""Prepare PASCAL VOC datasets"""
import os
import shutil
import argparse
import tarfile
from encoding.utils import download, mkdir
_TARGET_DIR = os.path.expanduser('~/.encoding/data')
def parse_args():
parser = argparse.ArgumentParser(
description='Initialize PASCAL VOC dataset.',
epilog='Example: python prepare_pascal.py',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--download-dir', type=str, default=None, help='dataset directory on disk')
parser.add_argument('--no-download', action='store_true', help='disable automatic download if set')
parser.add_argument('--overwrite', action='store_true', help='overwrite downloaded files if set, in case they are corrupted')
args = parser.parse_args()
return args
def download_voc(path, overwrite=False):
_DOWNLOAD_URLS = [
('http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',
'4e443f8a2eca6b1dac8a6c57641b67dd40621a49')]
download_dir = os.path.join(path, 'downloads')
mkdir(download_dir)
for url, checksum in _DOWNLOAD_URLS:
filename = download(url, path=download_dir, overwrite=overwrite, sha1_hash=checksum)
# extract
with tarfile.open(filename) as tar:
tar.extractall(path=path)
def download_aug(path, overwrite=False):
_AUG_DOWNLOAD_URLS = [
('http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz', '7129e0a480c2d6afb02b517bb18ac54283bfaa35')]
download_dir = os.path.join(path, 'downloads')
mkdir(download_dir)
for url, checksum in _AUG_DOWNLOAD_URLS:
filename = download(url, path=download_dir, overwrite=overwrite, sha1_hash=checksum)
# extract
with tarfile.open(filename) as tar:
tar.extractall(path=path)
shutil.move(os.path.join(path, 'benchmark_RELEASE'),
os.path.join(path, 'VOCaug'))
filenames = ['VOCaug/dataset/train.txt', 'VOCaug/dataset/val.txt']
# generate trainval.txt
with open(os.path.join(path, 'VOCaug/dataset/trainval.txt'), 'w') as outfile:
for fname in filenames:
fname = os.path.join(path, fname)
with open(fname) as infile:
for line in infile:
outfile.write(line)
if __name__ == '__main__':
args = parse_args()
mkdir(os.path.expanduser('~/.encoding/data'))
if args.download_dir is not None:
if os.path.isdir(_TARGET_DIR):
os.remove(_TARGET_DIR)
os.symlink(args.download_dir, _TARGET_DIR)
else:
download_voc(_TARGET_DIR, overwrite=False)
download_aug(_TARGET_DIR, overwrite=False)
"""Prepare PASCAL Context dataset"""
import os
import shutil
import argparse
import tarfile
from encoding.utils import download, mkdir
_TARGET_DIR = os.path.expanduser('~/.encoding/data')
PASD_URL="https://codalabuser.blob.core.windows.net/public/%s"
def parse_args():
parser = argparse.ArgumentParser(
description='Initialize PASCAL Context dataset.',
epilog='Example: python prepare_pcontext.py',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--download-dir', default=None, help='dataset directory on disk')
args = parser.parse_args()
return args
def download_pcontext(path, overwrite=False):
_AUG_DOWNLOAD_URLS = [
('http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar', 'bf9985e9f2b064752bf6bd654d89f017c76c395a'),
('https://codalabuser.blob.core.windows.net/public/trainval_merged.json', '169325d9f7e9047537fedca7b04de4dddf10b881')]
download_dir = os.path.join(path, 'downloads')
mkdir(download_dir)
for url, checksum in _AUG_DOWNLOAD_URLS:
filename = download(url, path=download_dir, overwrite=overwrite, sha1_hash=checksum)
# extract
if os.path.splitext(filename)[1] == '.tar':
with tarfile.open(filename) as tar:
tar.extractall(path=path)
else:
shutil.move(filename, os.path.join(path, 'VOCdevkit/VOC2010/'+os.path.basename(filename)))
if __name__ == '__main__':
args = parse_args()
mkdir(os.path.expanduser('~/.encoding/data'))
if args.download_dir is not None:
if os.path.isdir(_TARGET_DIR):
os.remove(_TARGET_DIR)
# make symlink
os.symlink(args.download_dir, _TARGET_DIR)
else:
download_pcontext(_TARGET_DIR, overwrite=False)
@@ -18,22 +18,7 @@ import setuptools.command.install
cwd = os.path.dirname(os.path.abspath(__file__))
# run test script after installation
class install(setuptools.command.install.install):
def run(self):
self.create_version_file()
setuptools.command.install.install.run(self)
#subprocess.check_call("python tests/unit_test.py".split())
@staticmethod
def create_version_file():
global version, cwd
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'encoding', 'version.py')
with open(version_path, 'w') as f:
f.write('"""This is encoding version file."""\n')
f.write("__version__ = '{}'\n".format(version))
version = '0.4.0'
version = '0.4.2'
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
cwd=cwd).decode('ascii').strip()
@@ -41,6 +26,27 @@ try:
except Exception:
pass
def create_version_file():
global version, cwd
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'encoding', 'version.py')
with open(version_path, 'w') as f:
f.write('"""This is encoding version file."""\n')
f.write("__version__ = '{}'\n".format(version))
# run test script after installation
class install(setuptools.command.install.install):
def run(self):
create_version_file()
setuptools.command.install.install.run(self)
#subprocess.check_call("python tests/unit_test.py".split())
class develop(setuptools.command.develop.develop):
def run(self):
create_version_file()
setuptools.command.develop.develop.run(self)
#subprocess.check_call("python tests/unit_test.py".split())
try:
import pypandoc
readme = pypandoc.convert('README.md', 'rst')
@@ -51,10 +57,20 @@ requirements = [
'numpy',
'tqdm',
'nose',
'torch>=0.3.1',
'torch>=0.5.0',
'cffi>=1.0.0',
]
requirements = [
'numpy',
'tqdm',
'nose',
'torch>=0.4.0',
'Pillow',
'scipy',
'requests',
]
setup(
name="encoding",
version=version,
@@ -67,17 +83,14 @@ setup(
install_requires=requirements,
packages=find_packages(exclude=["tests", "experiments"]),
package_data={ 'encoding': [
'lib/*.so*', 'lib/*.dylib*',
'_ext/encoding_lib/*.so', '_ext/encoding_lib/*.dylib',
'kernel/*.h', 'kernel/generic/*h',
'src/*.h',
'lib/cpu/*.h',
'lib/cpu/*.cpp',
'lib/gpu/*.h',
'lib/gpu/*.cpp',
'lib/gpu/*.cu',
]},
ext_package="",
# Extensions to compile.
cffi_modules=[
os.path.join(cwd, "build.py:ffi")
],
cmdclass={
'install': install,
'develop': develop,
},
)
@@ -15,12 +15,12 @@ from torch.autograd import Variable, gradcheck
import encoding
EPS = 1e-6
ATOL = 1e-4
EPS = 1e-3
ATOL = 1e-3
def _assert_tensor_close(a, b, atol=ATOL, rtol=EPS):
npa, npb = a.cpu().numpy(), b.cpu().numpy()
assert np.allclose(npa, npb, atol=atol), \
assert np.allclose(npa, npb, rtol=rtol, atol=atol), \
'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(
a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max())
@@ -79,95 +79,92 @@ def test_all_reduce():
for i in range(1, ngpu):
_assert_tensor_close(Y[i].data, Y[0].data)
input = (1, *X)
#test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
print('Testing allreduce(): {}'.format(test))
def _test_syncbn(train_mode=True):
def test_syncbn():
train_mode=True
# generate input
B,C,H,W = 8,3,4,5
X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5),
requires_grad=True)
input = (X,)
# SyncBN using DataParallel
layer = encoding.nn.SyncBatchNorm2d(C)
layer = encoding.nn.BatchNorm2d(C)
model = torch.nn.DataParallel(layer).double().cuda()
encoding.parallel.patch_replication_callback(model)
layer.train(train_mode)
# grad check
test = gradcheck(model, input, eps=EPS, atol=ATOL)
print('Testing SyncBatchNorm2d(): {}'.format(test))
print('Testing BatchNorm2d(): {}'.format(test))
def _test_syncbn_func(train_mode=True):
def test_syncbn_func():
# generate input
B, C, H = 2, 3, 4
X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5, 0.5),
requires_grad=True)
xsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
xsqsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
gamma = Variable(torch.ones(C).double().cuda(), requires_grad=True)
beta = Variable(torch.zeros(C).double().cuda(), requires_grad=True)
gamma.requires_grad=True
beta.requires_grad=True
runningVar = torch.ones(C).double().cuda()
runningMean = torch.zeros(C).double().cuda()
gamma = Variable(torch.cuda.DoubleTensor(C).uniform_(-0.5, 0.5), requires_grad=True)
beta = Variable(torch.cuda.DoubleTensor(C).uniform_(-0.5, 0.5), requires_grad=True)
mean = Variable(torch.cuda.DoubleTensor(C).uniform_(-0.5, 0.5), requires_grad=True)
std = Variable(torch.cuda.DoubleTensor(C).uniform_(-0.5, 0.5), requires_grad=True)
N = B * H
inputs = (X, xsum, xsqsum, gamma, beta, runningMean, runningVar, N, 0.1, 1e-5, train_mode)
inputs = (X, mean, std, gamma, beta)
# grad check
test = gradcheck(encoding.functions.batchnorm.apply, inputs, eps=EPS, atol=ATOL)
test = gradcheck(encoding.functions.batchnormtrain, inputs, eps=EPS, atol=ATOL)
print('Testing batchnorm(): {}'.format(test))
def _checkBatchNormResult(bn1, bn2, input, is_train, cuda=False):
def _find_bn(module):
for m in module.modules():
if isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
encoding.nn.SyncBatchNorm1d, encoding.nn.SyncBatchNorm2d)):
return m
def _syncParameters(bn1, bn2):
bn1.reset_parameters()
bn2.reset_parameters()
if bn1.affine and bn2.affine:
bn2.weight.data.copy_(bn1.weight.data)
bn2.bias.data.copy_(bn1.bias.data)
bn2.running_mean.copy_(bn1.running_mean)
bn2.running_var.copy_(bn1.running_var)
bn1.train(mode=is_train)
bn2.train(mode=is_train)
if cuda:
input = input.cuda()
# using the same values for gamma and beta
_syncParameters(_find_bn(bn1), _find_bn(bn2))
input1 = Variable(input.clone(), requires_grad=True)
output1 = bn1(input1)
input2 = Variable(input.clone(), requires_grad=True)
output2 = bn2(input2)
_assert_tensor_close(input1.data, input2.data)
_assert_tensor_close(output1.data, output2.data)
if not is_train:
return
(output1 ** 2).sum().backward()
(output2 ** 2).sum().backward()
_assert_tensor_close(input1.grad.data, input2.grad.data)
_assert_tensor_close(_find_bn(bn1).running_mean, _find_bn(bn2).running_mean)
_assert_tensor_close(_find_bn(bn1).running_var, _find_bn(bn2).running_var)
def testSyncBN():
def _checkBatchNormResult(bn1, bn2, input, is_train, cuda=False):
def _find_bn(module):
for m in module.modules():
if isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
encoding.nn.BatchNorm1d, encoding.nn.BatchNorm2d)):
return m
def _syncParameters(bn1, bn2):
bn1.reset_parameters()
bn2.reset_parameters()
if bn1.affine and bn2.affine:
bn2.weight.data.copy_(bn1.weight.data)
bn2.bias.data.copy_(bn1.bias.data)
bn2.running_mean.copy_(bn1.running_mean)
bn2.running_var.copy_(bn1.running_var)
bn1.train(mode=is_train)
bn2.train(mode=is_train)
if cuda:
input = input.cuda()
# using the same values for gamma and beta
_syncParameters(_find_bn(bn1), _find_bn(bn2))
input1 = Variable(input.clone().detach(), requires_grad=True)
input2 = Variable(input.clone().detach(), requires_grad=True)
output1 = bn1(input1)
output2 = bn2(input2)
# assert forwarding
_assert_tensor_close(input1.data, input2.data)
_assert_tensor_close(output1.data, output2.data)
if not is_train:
return
(output1 ** 2).sum().backward()
(output2 ** 2).sum().backward()
_assert_tensor_close(input1.grad.data, input2.grad.data)
_assert_tensor_close(_find_bn(bn1).running_mean, _find_bn(bn2).running_mean)
_assert_tensor_close(_find_bn(bn1).running_var, _find_bn(bn2).running_var)
def testSyncBN():
bn = torch.nn.BatchNorm2d(10).cuda().double()
sync_bn = encoding.nn.SyncBatchNorm2d(10).double()
sync_bn = encoding.nn.BatchNorm2d(10).double()
sync_bn = torch.nn.DataParallel(sync_bn).cuda()
encoding.parallel.patch_replication_callback(sync_bn)
# check with unsync version
for i in range(10):
print(i)
_checkBatchNormResult(bn, sync_bn, torch.rand(16, 10, 16, 16).double(), True, cuda=True)
_checkBatchNormResult(bn, sync_bn, torch.rand(16, 10, 16, 16).double(), False, cuda=True)
# gradcheck
_test_syncbn_func(True)
_test_syncbn(True)
if __name__ == '__main__':
import nose