Commit 8f8fbb9f authored by Hang Zhang's avatar Hang Zhang
Browse files

v1.0.1

parent aa9af7fd
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
import threading
import torch
import torch.cuda.comm as comm
from torch.autograd import Variable
from torch.nn import Module, Sequential
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.utils import _single, _pair, _triple
from torch.nn.parallel.scatter_gather import scatter, scatter_kwargs, \
gather
from ..functions import view_each, multi_each, sum_each, batchnormtrain, batchnormeval, sum_square
from ..parallel import my_data_parallel, Broadcast, AllReduce
__all__ = ['BatchNorm1d', 'BatchNorm2d']
class BatchNorm1d(Module):
    r"""Synchronized Batch Normalization 1d
    Please use compatible :class:`encoding.parallel.SelfDataParallel` and :class:`encoding.nn`
    Applies Batch Normalization over a 2d or 3d input that is seen as a
    mini-batch.
    .. math::
        y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).
    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.
    During evaluation, this running mean/variance is used for normalization.
    Args:
        num_features: num_features from an expected input of size
            `batch_size x num_features [x width]`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to true, gives the layer
            learnable affine parameters. Default: True
    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
    Examples:
        >>> m = encoding.nn.BatchNorm1d(100).cuda()
        >>> input = autograd.Variable(torch.randn(20, 100)).cuda()
        >>> output = m(input)
    """
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(BatchNorm1d, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps
        self.momentum = momentum
        if self.affine:
            self.weight = Parameter(torch.Tensor(num_features))
            self.bias = Parameter(torch.Tensor(num_features))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        # Running statistics are buffers so they follow .cuda()/state_dict().
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()
        # Serializes writes to the shared result dicts from worker threads.
        self.writelock = threading.Lock()

    def reset_parameters(self):
        # Reset running statistics and re-initialize the affine parameters.
        self.running_mean.zero_()
        self.running_var.fill_(1)
        if self.affine:
            self.weight.data.uniform_()
            self.bias.data.zero_()

    def __repr__(self):
        return ('{name}({num_features}, eps={eps}, momentum={momentum},'
                ' affine={affine})'
                .format(name=self.__class__.__name__, **self.__dict__))

    def _check_input_dim(self, input):
        # NOTE(review): only 3D input is accepted here, although the class
        # docstring (and the 2D example above) suggests (N, C) should also
        # work -- confirm against callers.
        if input.dim() != 3:
            raise ValueError('expected 3D input (got {}D input)'
                             .format(input.dim()))

    def forward(self, input):
        if isinstance(input, Variable):
            # Single-device path: one Variable in, one Variable out.
            self._check_input_dim(input)
            if self.training:
                # sum_square expects a 4D tensor; add a trailing unit dim.
                xsum, xsquare = sum_square(input.unsqueeze(3))
                N = input.size(0)*input.size(2)
                mean = xsum / N
                # Sum of squared deviations: sum(x^2) - (sum x)^2 / N.
                sumvar = xsquare - xsum * xsum / N
                # Unbiased (N-1) variance for the running estimate; biased
                # std (divide by N) for normalizing the current batch.
                unbias_var = sumvar / (N - 1)
                std = (sumvar / N + self.eps).sqrt()
                # update running_mean and var.
                # NOTE(review): plain assignment replaces the registered
                # buffers with ordinary tensors instead of updating in place.
                self.running_mean = (1-self.momentum) * self.running_mean \
                    + self.momentum * mean.data
                self.running_var = (1-self.momentum) * self.running_var + \
                    self.momentum * unbias_var.data
                # forward
                output = batchnormtrain(
                    input, self.weight,
                    self.bias, mean,
                    std)
                return output
            else:
                # Evaluation: normalize with the stored running statistics.
                var_mean = Variable(self.running_mean, requires_grad=False)
                bias_var = Variable(self.running_var, requires_grad=False)
                std = (bias_var + self.eps).sqrt()
                return batchnormeval(
                    input, self.weight, self.bias, var_mean, std)
        elif isinstance(input, tuple) or isinstance(input, list):
            # Multi-GPU path: `input` is a list with one chunk per GPU.
            self._check_input_dim(input[0])
            # if evaluation, do it simple
            if not self.training:
                return my_data_parallel(self, input)
            if len(input) == 1:
                return self.forward(input[0])
            # calculate per-GPU sums / sums of squares using multithreading
            all_sum, all_xsquare = {},{}
            def _worker(i, x, lock):
                try:
                    with torch.cuda.device_of(x):
                        xsum, xsquare = sum_square(x.unsqueeze(3))
                    with lock:
                        all_sum[i] = xsum
                        all_xsquare[i] = xsquare
                except Exception as e:
                    # Store the exception; surfaced when results are collected.
                    with lock:
                        all_sum[i] = e
                        all_xsquare[i] = e
            threads = [threading.Thread(target=_worker,
                                        args=(i, x, self.writelock))
                       for i, x in enumerate(input)]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            # convert to list (dict keys are the worker indices 0..nGPUs-1)
            def _to_list(x):
                outputs = []
                for i in range(len(x)):
                    outputs.append(x[i])
                return outputs
            all_sum = _to_list(all_sum)
            all_xsquare = _to_list(all_xsquare)
            # Cross-GPU reduction so every device sees global statistics.
            xsums = AllReduce()(*all_sum)
            xsquares = AllReduce()(*all_xsquare)
            nGPUs = len(input)
            # Total element count across GPUs (assumes equal chunk sizes).
            N = nGPUs * input[0].size(0)*input[0].size(2)
            assert(N>1)
            xmean = xsums[0].data / N
            unbias_var = (xsquares[0].data - N * xmean * xmean) / (N-1)
            # update running_mean and var (same caveat as above).
            self.running_mean = (1-self.momentum) * self.running_mean \
                + self.momentum * xmean
            self.running_var = (1-self.momentum) * self.running_var + \
                self.momentum * unbias_var
            # Broadcast the weight and bias to each participating GPU.
            device_ids = list(range(torch.cuda.device_count()))
            weights = Broadcast(device_ids[:len(input)])(self.weight)
            biases = Broadcast(device_ids[:len(input)])(self.bias)
            # parallel-apply: normalize each chunk on its own device
            results = {}
            def _worker_bn(i, x, xsum, xsquare, weight, bias, lock):
                # NOTE(review): `_get_a_var` is not defined or imported in
                # this module (`get_a_var` lives in ..parallel); this line
                # would raise NameError if reached -- confirm intended name.
                var_input = _get_a_var(x)
                mean = xsum / N
                std = (xsquare / N - mean * mean + self.eps).sqrt()
                try:
                    with torch.cuda.device_of(var_input):
                        result = batchnormtrain(
                            x, weight, bias, mean, std)
                    with lock:
                        results[i] = result
                except Exception as e:
                    with lock:
                        results[i] = e
            threads = [threading.Thread(target=_worker_bn,
                                        args=(i, x, xsum, xsquare, weight,
                                              bias, self.writelock)
                                        )
                       for i, (x, xsum, xsquare, weight, bias) in
                       enumerate(zip(input, xsums, xsquares,
                                     weights, biases))]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            outputs = []
            for i in range(len(results)):
                output = results[i]
                if isinstance(output, Exception):
                    raise output
                outputs.append(output)
            return outputs
        else:
            raise RuntimeError('unknown input type')
class BatchNorm2d(Module):
    r"""Synchronized Batch Normalization 2d
    Please use compatible :class:`encoding.parallel.SelfDataParallel` and :class:`encoding.nn`
    Applies Batch Normalization over a 4d input that is seen as a mini-batch
    of 3d inputs
    .. math::
        y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).
    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.
    During evaluation, this running mean/variance is used for normalization.
    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to true, gives the layer learnable
            affine parameters. Default: True
    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)
    Examples:
        >>> m = encoding.nn.BatchNorm2d(100).cuda()
        >>> input = autograd.Variable(torch.randn(20, 100, 35, 45)).cuda()
        >>> output = m(input)
    """
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(BatchNorm2d, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps
        self.momentum = momentum
        if self.affine:
            self.weight = Parameter(torch.Tensor(num_features))
            self.bias = Parameter(torch.Tensor(num_features))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        # Running statistics are buffers so they follow .cuda()/state_dict().
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()
        # Serializes writes to the shared result dicts from worker threads.
        self.writelock = threading.Lock()

    def reset_parameters(self):
        # Reset running statistics and re-initialize the affine parameters.
        self.running_mean.zero_()
        self.running_var.fill_(1)
        if self.affine:
            self.weight.data.uniform_()
            self.bias.data.zero_()

    def __repr__(self):
        return ('{name}({num_features}, eps={eps}, momentum={momentum},'
                ' affine={affine})'
                .format(name=self.__class__.__name__, **self.__dict__))

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'
                             .format(input.dim()))

    def forward(self, input):
        if isinstance(input, Variable):
            # Single-device path: one Variable in, one Variable out.
            self._check_input_dim(input)
            if self.training:
                xsum, xsquare = sum_square(input)
                N = input.size(0)*input.size(2)*input.size(3)
                mean = xsum / N
                # Sum of squared deviations: sum(x^2) - (sum x)^2 / N.
                sumvar = xsquare - xsum * xsum / N
                # Unbiased (N-1) variance for the running estimate; biased
                # std (divide by N) for normalizing the current batch.
                unbias_var = sumvar / (N - 1)
                std = (sumvar / N + self.eps).sqrt()
                # update running_mean and var.
                # NOTE(review): plain assignment replaces the registered
                # buffers with ordinary tensors instead of updating in place.
                self.running_mean = (1-self.momentum) * self.running_mean \
                    + self.momentum * mean.data
                self.running_var = (1-self.momentum) * self.running_var + \
                    self.momentum * unbias_var.data
                # forward: the kernel takes a 3D (B, C, H*W) view and the
                # output is reshaped back to (B, C, H, W).
                B, C, H, W = input.size()
                output = batchnormtrain(
                    input.view(B,C,-1).contiguous(), self.weight,
                    self.bias, mean,
                    std)
                return output.view(B, C, H, W)
            else:
                # Evaluation: normalize with the stored running statistics.
                var_mean = Variable(self.running_mean, requires_grad=False)
                bias_var = Variable(self.running_var, requires_grad=False)
                std = (bias_var + self.eps).sqrt()
                B, C, H, W = input.size()
                return batchnormeval(
                    input.view(B,C,-1).contiguous(),
                    self.weight, self.bias, var_mean,
                    std).view(B, C, H, W)
        elif isinstance(input, tuple) or isinstance(input, list):
            # Multi-GPU path: `input` is a list with one chunk per GPU.
            self._check_input_dim(input[0])
            # if evaluation, do it simple
            if not self.training:
                return my_data_parallel(self, input)
            if len(input) == 1:
                return self.forward(input[0])
            # calculate per-GPU sums / sums of squares using multithreading
            all_sum, all_xsquare = {},{}
            def _worker(i, x, lock):
                try:
                    with torch.cuda.device_of(x):
                        xsum, xsquare = sum_square(x)
                    with lock:
                        all_sum[i] = xsum
                        all_xsquare[i] = xsquare
                except Exception as e:
                    # Store the exception; surfaced when results are collected.
                    with lock:
                        all_sum[i] = e
                        all_xsquare[i] = e
            threads = [threading.Thread(target=_worker,
                                        args=(i, x, self.writelock))
                       for i, x in enumerate(input)]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            # convert to list (dict keys are the worker indices 0..nGPUs-1)
            def _to_list(x):
                outputs = []
                for i in range(len(x)):
                    outputs.append(x[i])
                return outputs
            all_sum = _to_list(all_sum)
            all_xsquare = _to_list(all_xsquare)
            # Cross-GPU reduction so every device sees global statistics.
            xsums = AllReduce()(*all_sum)
            xsquares = AllReduce()(*all_xsquare)
            nGPUs = len(input)
            # Total element count across GPUs (assumes equal chunk sizes).
            N = nGPUs * input[0].size(0)*input[0].size(2)*input[0].size(3)
            assert(N>1)
            xmean = xsums[0].data / N
            unbias_var = (xsquares[0].data - N * xmean * xmean) / (N-1)
            # update running_mean and var (same caveat as above).
            self.running_mean = (1-self.momentum) * self.running_mean \
                + self.momentum * xmean
            self.running_var = (1-self.momentum) * self.running_var + \
                self.momentum * unbias_var
            # Broadcast the weight and bias to each participating GPU.
            device_ids = list(range(torch.cuda.device_count()))
            weights = Broadcast(device_ids[:len(input)])(self.weight)
            biases = Broadcast(device_ids[:len(input)])(self.bias)
            # parallel-apply: normalize each chunk on its own device
            results = {}
            def _worker_bn(i, x, xsum, xsquare, weight, bias, lock):
                # NOTE(review): `_get_a_var` is not defined or imported in
                # this module (`get_a_var` lives in ..parallel); this line
                # would raise NameError if reached -- confirm intended name.
                var_input = _get_a_var(x)
                mean = xsum / N
                std = (xsquare / N - mean * mean + self.eps).sqrt()
                try:
                    with torch.cuda.device_of(var_input):
                        B, C, H, W = x.size()
                        # NOTE(review): unlike the single-device path above,
                        # no .contiguous() is applied to the view here.
                        result = batchnormtrain(
                            x.view(B,C, -1), weight, bias, mean,
                            std).view(B, C, H, W)
                    with lock:
                        results[i] = result
                except Exception as e:
                    with lock:
                        results[i] = e
            threads = [threading.Thread(target=_worker_bn,
                                        args=(i, x, xsum, xsquare, weight,
                                              bias, self.writelock)
                                        )
                       for i, (x, xsum, xsquare, weight, bias) in
                       enumerate(zip(input, xsums, xsquares,
                                     weights, biases))]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            outputs = []
            for i in range(len(results)):
                output = results[i]
                if isinstance(output, Exception):
                    raise output
                outputs.append(output)
            return outputs
        else:
            raise RuntimeError('unknown input type')
......@@ -19,12 +19,69 @@ from torch.nn.parallel.scatter_gather import scatter, scatter_kwargs, \
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.parallel_apply import parallel_apply
def nccl_all_reduce(inputs):
    """Sum `inputs` across GPUs in place via NCCL and return them.

    Every tensor must be on a GPU and all tensors must share one size.
    """
    # TODO, figure out why nccl all_reduce doesn't work for gradcheck
    expected_size = inputs[0].size()
    #if nccl.is_available(inputs):
    for idx, tensor in enumerate(inputs):
        assert tensor.is_cuda, \
            "reduce_add expects all inputs to be on GPUs"
        if tensor.size() != expected_size:
            got = 'x'.join(str(dim) for dim in tensor.size())
            expected = 'x'.join(str(dim) for dim in expected_size)
            raise ValueError("input {} has invalid size: got {}, \
but expected {}".format(idx, got, expected))
    nccl.all_reduce(inputs)
    return inputs
def comm_all_reduce(inputs):
    """All-reduce via torch.cuda.comm: sum all inputs on one device, then
    copy the summed result to GPUs 0..len(inputs)-1.
    """
    # comm backend
    total = comm.reduce_add(inputs)
    return [total.clone().cuda(device) for device in range(len(inputs))]
class AllReduce(Function):
    """Cross GPU all reduce autograd operation for calculate mean and
    variance in SyncBN.

    All-reduce is its own adjoint, so backward applies the same reduction
    to the gradients.
    """
    # NOTE: invoked old-style as AllReduce()(*tensors), so despite the name
    # `ctx` these behave as bound methods on the Function instance.
    def forward(ctx, *inputs):
        outputs = comm_all_reduce(list(inputs))
        return tuple(outputs)
    def backward(ctx, *gradOutputs):
        gradInputs = comm_all_reduce(list(gradOutputs))
        return tuple(gradInputs)
class Broadcast(Function):
    """Multi-GPU broadcast autograd function.

    Legacy stateful autograd Function (constructed per call): forward copies
    every input tensor to each target GPU; backward sum-reduces the
    corresponding gradients back onto the original input device.
    """
    def __init__(self, target_gpus):
        super(Broadcast, self).__init__()
        # Device ids to broadcast onto.
        self.target_gpus = target_gpus

    def forward(self, *inputs):
        if not all(input.is_cuda for input in inputs):
            raise TypeError('Broadcast function not implemented for CPU tensors')
        if len(inputs) == 0:
            return tuple()
        # Remember the layout so backward can regroup the flat gradients.
        self.num_inputs = len(inputs)
        self.input_device = inputs[0].get_device()
        outputs = comm.broadcast_coalesced(inputs, self.target_gpus)
        # Flatten [per-gpu][per-input] into a single tuple of tensors.
        return tuple([t for tensors in outputs for t in tensors])

    def backward(self, *grad_outputs):
        # Regroup the flat gradient tuple per GPU, then reduce-add the
        # groups back onto the original input device.
        grad_outputs = [grad_outputs[i:i + self.num_inputs]
                        for i in range(0, len(grad_outputs), self.num_inputs)]
        return comm.reduce_add_coalesced(grad_outputs, self.input_device)
class ModelDataParallel(Module):
"""Implements data parallelism at the module level.
.. ModelDataParallel_
This container parallelizes the application of the given module by
splitting the input across the specified devices by chunking in the
batch dimension.
......@@ -32,7 +89,7 @@ class ModelDataParallel(Module):
and each replica handles a portion of the input. During the backwards
pass, gradients from each replica are summed into the original module.
Note that the outputs are not gathered, please use compatible
CriterionDataParallel_ .
:class:`encoding.parallel.CriterionDataParallel`.
The batch size should be larger than the number of GPUs used. It should
also be an integer multiple of the number of GPUs so that each chunk is
......@@ -44,19 +101,29 @@ class ModelDataParallel(Module):
Example::
>>> net = torch.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
>>> net = encoding.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
>>> output = net(input_var)
"""
def __init__(self, module, device_ids=None, dim=0):
def __init__(self, module, device_ids=None, output_device=None, dim=0):
super(ModelDataParallel, self).__init__()
if device_ids is None:
device_ids = list(range(torch.cuda.device_count()))
if output_device is None:
output_device = device_ids[0]
self.dim = dim
self.module = module
self.device_ids = device_ids
self.output_device = output_device
self.master_mean, self.master_var = {}, {}
if len(self.device_ids) == 1:
self.module.cuda(device_ids[0])
"""
# TODO FIXME temporal solution for BN
for m in self.module.modules():
classname = m.__class__.__name__
if classname.find('BatchNorm2d') != -1:
m.momentum = 0.9996
"""
def forward(self, *inputs, **kwargs):
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
......@@ -79,13 +146,11 @@ class ModelDataParallel(Module):
class CriterionDataParallel(Module):
"""
.. CriterionDataParallel_
Calculate loss in multiple-GPUs, which balance the memory usage for
Semantic Segmentation.
The targets are splitted across the specified devices by chunking in
the batch dimension. Please use together with ModelDataParallel_
the batch dimension. Please use together with :class:`encoding.parallel.ModelDataParallel`.
"""
def __init__(self, module, device_ids=None, output_device=None, dim=0):
super(CriterionDataParallel, self).__init__()
......@@ -123,3 +188,158 @@ class CriterionDataParallel(Module):
return gather(outputs, output_device, dim=self.dim).mean()
class SelfDataParallel(Module):
    """SelfDataParallel, please make sure you understand it before using.

    Each module in the network should be in self-parallel mode,
    which allows list of inputs from multiple GPUs.
    Please see :mod:`encoding.nn` for detail, use with cautious
    """
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(SelfDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        self.master_mean, self.master_var = {}, {}
        # Single device: just place the module there; no scatter machinery.
        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])

    def forward(self, *inputs, **kwargs):
        # Scatter inputs across devices, then hand the WHOLE list to the
        # module -- self-parallel modules accept a list of per-GPU chunks.
        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
        outputs = self.module(inputs)
        return outputs

    def scatter(self, inputs, kwargs, device_ids):
        # Chunk inputs/kwargs along `self.dim`, one chunk per device.
        #return my_scatter(inputs, target_gpus=device_ids)
        outputs = scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
        return outputs
def criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
    """Apply each criterion replica to its (input, target) pair, one thread
    per replica, and return the per-replica outputs as a list.

    Exceptions raised inside a worker are captured and re-raised here, in
    the calling thread, preserving replica order.
    """
    assert len(modules) == len(inputs)
    assert len(targets) == len(inputs)
    if kwargs_tup:
        assert len(modules) == len(kwargs_tup)
    else:
        kwargs_tup = ({},) * len(modules)
    # Fast track
    if len(modules) == 1:
        return (modules[0](*inputs[0], *targets[0], **kwargs_tup[0]), )
    lock = threading.Lock()
    results = {}
    def _worker(i, module, input, target, kwargs, results, lock):
        # Drill down to a Variable so device_of can pick the right GPU.
        var_input = input
        while not isinstance(var_input, Variable):
            var_input = var_input[0]
        var_target = target
        while not isinstance(var_target, Variable):
            var_target = var_target[0]
        try:
            with torch.cuda.device_of(var_input):
                output = module(input, *target, **kwargs)
            with lock:
                results[i] = output
        except Exception as e:
            # Stash the exception; re-raised after all threads join.
            with lock:
                results[i] = e
    threads = [threading.Thread(target=_worker,
                                args=(i, module, input, target,
                                      kwargs, results, lock),
                                )
               for i, (module, input, target, kwargs) in
               enumerate(zip(modules, inputs, targets, kwargs_tup))]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    outputs = []
    for i in range(len(inputs)):
        output = results[i]
        if isinstance(output, Exception):
            raise output
        outputs.append(output)
    return outputs
def get_a_var(obj):
    """Depth-first search *obj* (a Variable, or nested lists/tuples/dicts)
    and return the first Variable found, or None when there is none.
    """
    if isinstance(obj, Variable):
        return obj
    if isinstance(obj, (list, tuple)):
        for element in obj:
            found = get_a_var(element)
            if isinstance(found, Variable):
                return found
    if isinstance(obj, dict):
        for item in obj.items():
            found = get_a_var(item)
            if isinstance(found, Variable):
                return found
    return None
def my_parallel_apply(modules, inputs, kwargs_tup=None):
    """Apply each module replica to its input chunk, one thread per replica,
    and return the per-replica outputs as a list.

    Mirrors torch.nn.parallel.parallel_apply; exceptions raised inside a
    worker are captured and re-raised here in replica order.
    """
    assert len(modules) == len(inputs)
    if kwargs_tup:
        assert len(modules) == len(kwargs_tup)
    else:
        kwargs_tup = ({},) * len(modules)
    # Fast track
    if len(modules) == 1:
        return (modules[0](*inputs[0], **kwargs_tup[0]), )
    lock = threading.Lock()
    results = {}
    def _worker(i, module, input, kwargs, results, lock):
        # Find any Variable in the (possibly nested) input to pick a device.
        var_input = get_a_var(input)
        try:
            with torch.cuda.device_of(var_input):
                output = module(input, **kwargs)
            with lock:
                results[i] = output
        except Exception as e:
            # Stash the exception; re-raised after all threads join.
            with lock:
                results[i] = e
    threads = [threading.Thread(target=_worker,
                                args=(i, module, input, kwargs, results, lock),
                                )
               for i, (module, input, kwargs) in
               enumerate(zip(modules, inputs, kwargs_tup))]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    outputs = []
    for i in range(len(inputs)):
        output = results[i]
        if isinstance(output, Exception):
            raise output
        outputs.append(output)
    return outputs
def my_data_parallel(module, inputs, device_ids=None,
                     dim=0, module_kwargs=None):
    """Replicate *module* onto one device per input chunk and apply each
    replica to its chunk in parallel; returns the list of outputs.

    `inputs` is already scattered (one element per GPU). With a single
    chunk the module is applied directly, without replication.
    """
    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))
    n_chunks = len(inputs)
    if n_chunks == 1:
        return module(inputs[0])
    replicas = replicate(module, device_ids[:n_chunks])
    return my_parallel_apply(replicas, inputs, module_kwargs)
......@@ -17,12 +17,26 @@ extern THCState *state;
extern "C" {
#endif
// float
#include "generic/encoding_generic.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_generic.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_generic.c"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_generic.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_generic.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_generic.c"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
}
#endif
......@@ -64,10 +64,22 @@ int Encoding_Float_batchnorm_Backward(THCudaTensor *gradoutput_,
int Encoding_Float_sum_square_Forward(THCudaTensor *input_,
THCudaTensor *sum_, THCudaTensor *square_);
void Encoding_Float_sum_square_Backward(
int Encoding_Float_sum_square_Backward(
THCudaTensor *gradInput, THCudaTensor *input_,
THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
int Encoding_Float_DilatedAvgPool2d_Forward(
THCudaTensor *X_, THCudaTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
int Encoding_Float_DilatedAvgPool2d_Backward(
THCudaTensor *gradX_, THCudaTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_Double_scaledl2_forward(THCudaDoubleTensor *SL,
......@@ -124,3 +136,15 @@ int Encoding_Double_sum_square_Forward(THCudaDoubleTensor *input_,
void Encoding_Double_sum_square_Backward(
THCudaDoubleTensor *gradInput, THCudaDoubleTensor *input_,
THCudaDoubleTensor *gradSum_, THCudaDoubleTensor *gradSquare_);
int Encoding_Double_DilatedAvgPool2d_Forward(
THCudaDoubleTensor *X_, THCudaDoubleTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
int Encoding_Double_DilatedAvgPool2d_Backward(
THCudaDoubleTensor *gradX_, THCudaDoubleTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
......@@ -127,56 +127,5 @@ int Encoding_(squaresqueeze_backward)(THCTensor *GL, THCTensor *GR,
/* C function return number of the outputs */
return 0;
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
*
*/
{
Encoding_(BatchNorm_Forward)(state, output_, input_,
mean_, invstd_, gamma_, beta_);
/* C function return number of the outputs */
return 0;
}
int Encoding_(batchnorm_Backward)(THCTensor *gradoutput_,
THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
*/
{
Encoding_(BatchNorm_Backward)(state, gradoutput_, input_, gradinput_,
gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_, gradMean_, gradStd_,
train);
/* C function return number of the outputs */
return 0;
}
int Encoding_(sum_square_Forward)(THCTensor *input_,
THCTensor *sum_, THCTensor *square_)
/*
*/
{
Encoding_(Sum_Square_Forward)(state, input_, sum_, square_);
/* C function return number of the outputs */
return 0;
}
int Encoding_(sum_square_Backward)(
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
*/
{
Encoding_(Sum_Square_Backward)(state, gradInput, input_, gradSum_,
gradSquare_);
/* C function return number of the outputs */
return 0;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_generic.c"
#else

/* Dilated average pooling, forward pass.
 * Thin C wrapper delegating to Encoding_(DilatedAvgPool_Forward);
 * X_ is the input tensor, Y_ receives the pooled output.
 * kH/kW: kernel, dH/dW: stride, padH/padW: padding,
 * dilationH/dilationW: dilation. */
int Encoding_(DilatedAvgPool2d_Forward)(
    THCTensor *X_, THCTensor *Y_,
    int kH, int kW, int dH, int dW,
    int padH, int padW,
    int dilationH, int dilationW)
{
    Encoding_(DilatedAvgPool_Forward)(state,
        X_, Y_, kH, kW, dH, dW,
        padH, padW, dilationH, dilationW);
    /* C function return number of the outputs */
    return 0;
}

/* Dilated average pooling, backward pass.
 * Computes gradX_ (gradient w.r.t. the input) from gradY_ by delegating
 * to Encoding_(DilatedAvgPool_Backward); same geometry parameters as the
 * forward wrapper above. */
int Encoding_(DilatedAvgPool2d_Backward)(
    THCTensor *gradX_, THCTensor *gradY_,
    int kH, int kW, int dH, int dW,
    int padH, int padW,
    int dilationH, int dilationW)
{
    Encoding_(DilatedAvgPool_Backward)(state,
        gradX_, gradY_, kH, kW, dH, dW,
        padH, padW, dilationH, dilationW);
    /* C function return number of the outputs */
    return 0;
}

#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_generic.c"
#else

/* SyncBN forward: writes to output_ the normalized input_, using the given
 * per-channel mean_ / invstd_ and affine parameters gamma_ / beta_.
 * Thin wrapper around Encoding_(BatchNorm_Forward). */
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
    THCTensor *mean_, THCTensor *invstd_,
    THCTensor *gamma_, THCTensor *beta_)
{
    Encoding_(BatchNorm_Forward)(state, output_, input_,
        mean_, invstd_, gamma_, beta_);
    /* C function return number of the outputs */
    return 0;
}

/* SyncBN backward: given gradoutput_, fills gradinput_, gradgamma_,
 * gradbeta_, gradMean_ and gradStd_; `train` selects training-mode
 * gradients. Thin wrapper around Encoding_(BatchNorm_Backward). */
int Encoding_(batchnorm_Backward)(THCTensor *gradoutput_,
    THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train)
{
    Encoding_(BatchNorm_Backward)(state, gradoutput_, input_, gradinput_,
        gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_, gradMean_, gradStd_,
        train);
    /* C function return number of the outputs */
    return 0;
}

/* Per-channel reduction used by SyncBN statistics: fills sum_ and square_
 * from input_. Thin wrapper around Encoding_(Sum_Square_Forward). */
int Encoding_(sum_square_Forward)(THCTensor *input_,
    THCTensor *sum_, THCTensor *square_)
{
    Encoding_(Sum_Square_Forward)(state, input_, sum_, square_);
    /* C function return number of the outputs */
    return 0;
}

/* Backward of the sum/square reduction: fills gradInput from input_ and
 * the gradients gradSum_ / gradSquare_ of the two reduction outputs. */
int Encoding_(sum_square_Backward)(
    THCTensor *gradInput, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_)
{
    Encoding_(Sum_Square_Backward)(state, gradInput, input_, gradSum_,
        gradSquare_);
    /* C function return number of the outputs */
    return 0;
}

#endif
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
import shutil
import os
import sys
import time
import math
def get_optimizer(args, model, diff_LR=True):
    """Return an SGD optimizer for *model*.

    Args:
        args: provides :attr:`args.lr`, :attr:`args.momentum`,
            :attr:`args.weight_decay`.
        model: if using a differential learning rate, the model should
            define ``model.pretrained`` and ``model.head``.
        diff_LR: when True and the model exposes a non-None ``pretrained``
            attribute, the head parameters get 10x the base learning rate.
    """
    # getattr with a default so models WITHOUT a `pretrained` attribute fall
    # back to a single parameter group instead of raising AttributeError
    # (diff_LR defaults to True, so plain models used to crash here).
    if diff_LR and getattr(model, 'pretrained', None) is not None:
        print('Using different learning rate for pre-trained features')
        optimizer = torch.optim.SGD([
            {'params': model.pretrained.parameters()},
            {'params': model.head.parameters(),
             'lr': args.lr*10},  # head trains 10x faster than the backbone
        ],
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    return optimizer
class CosLR_Scheduler(object):
    """Cosine Learning Rate Scheduler

    .. math::
        lr = base\\_lr * 0.5 * (1 + cos(T/N * \\pi))

    where ``T`` is the current iteration and ``N`` the total number of
    iterations over all epochs.

    Args:
        args: base learning rate :attr:`args.lr`, number of epochs
            :attr:`args.epochs`
        niters: number of iterations per epoch
    """
    def __init__(self, args, niters):
        self.lr = args.lr          # base learning rate
        self.niters = niters       # iterations per epoch
        self.N = args.epochs * niters  # total iterations
        self.epoch = -1            # last epoch we printed for

    def __call__(self, optimizer, i, epoch, best_pred):
        # Global step (epochs are 1-based at the call sites).
        T = (epoch - 1) * self.niters + i
        lr = 0.5 * self.lr * (1 + math.cos(1.0 * T / self.N * math.pi))
        # Log once per epoch, on its first invocation.
        if epoch > self.epoch:
            print('=>Epochs %i, learning rate = %.4f, previous best ='\
                '%.3f%%' % (epoch, lr, best_pred))
            self.epoch = epoch
        self._adjust_learning_rate(optimizer, lr)

    def _adjust_learning_rate(self, optimizer, lr):
        groups = optimizer.param_groups
        if len(groups) == 1:
            groups[0]['lr'] = lr
        elif len(groups) == 2:
            # enlarge the lr at the head
            groups[0]['lr'] = lr
            groups[1]['lr'] = lr * 10
        else:
            raise RuntimeError('unsupported number of param groups: {}' \
                .format(len(groups)))
# refer to https://github.com/xternalz/WideResNet-pytorch
def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
    """Saves checkpoint to disk under runs/<dataset>/<model>/<checkname>/;
    when *is_best* is set, also copies it to ``model_best.pth.tar``.
    """
    directory = "runs/%s/%s/%s/"%(args.dataset, args.model, args.checkname)
    if not os.path.exists(directory):
        os.makedirs(directory)
    checkpoint_path = directory + filename
    torch.save(state, checkpoint_path)
    if is_best:
        shutil.copyfile(checkpoint_path, directory + 'model_best.pth.tar')
# refer to https://github.com/kuangliu/pytorch-cifar/blob/master/utils.py
# Query the terminal width for the progress bar. Without a controlling
# terminal (pipes, CI), `stty size` prints nothing and the two-value unpack
# raises ValueError -- fall back to the stdlib query with a sane default
# instead of crashing at import time.
try:
    _, term_width = os.popen('stty size', 'r').read().split()
    term_width = int(term_width)
except (ValueError, OSError):
    term_width = shutil.get_terminal_size((80, 24)).columns
TOTAL_BAR_LENGTH = 86.  # characters devoted to the bar itself
last_time = time.time()   # timestamp of the previous progress_bar call
begin_time = last_time    # timestamp when the current bar was started
def progress_bar(current, total, msg=None):
    """Progress Bar for display

    Renders an in-place text progress bar on stdout, e.g.
    ``[====> 3/100 ....]  Step: 12ms | Tot: 1s230ms | <msg>``.

    Args:
        current: zero-based index of the current step.
        total: total number of steps.
        msg: optional extra text appended after the timings.
    """
    global last_time, begin_time
    if current == 0:
        begin_time = time.time()  # Reset for new bar.
    # Number of '=' (done) and '.' (remaining) cells in the bar.
    cur_len = int(TOTAL_BAR_LENGTH*current/total)
    rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
    sys.stdout.write(' [')
    for i in range(cur_len):
        sys.stdout.write('=')
    sys.stdout.write('>')
    for i in range(rest_len):
        sys.stdout.write('.')
    sys.stdout.write(']')
    cur_time = time.time()
    step_time = cur_time - last_time
    last_time = cur_time
    tot_time = cur_time - begin_time
    L = []
    L.append(' Step: %s' % _format_time(step_time))
    L.append(' | Tot: %s' % _format_time(tot_time))
    if msg:
        L.append(' | ' + msg)
    msg = ''.join(L)
    sys.stdout.write(msg)
    # Pad with spaces so leftovers from a longer previous line are erased.
    for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
        sys.stdout.write(' ')
    # Go back to the center of the bar and write the step counter there.
    for i in range(term_width-int(TOTAL_BAR_LENGTH/2)):
        sys.stdout.write('\b')
    sys.stdout.write(' %d/%d ' % (current+1, total))
    # Carriage return keeps redrawing the same line until the final step.
    if current < total-1:
        sys.stdout.write('\r')
    else:
        sys.stdout.write('\n')
    sys.stdout.flush()
def _format_time(seconds):
days = int(seconds / 3600/24)
seconds = seconds - days*3600*24
hours = int(seconds / 3600)
seconds = seconds - hours*3600
minutes = int(seconds / 60)
seconds = seconds - minutes*60
secondsf = int(seconds)
seconds = seconds - secondsf
millis = int(seconds*1000)
f = ''
i = 1
if days > 0:
f += str(days) + 'D'
i += 1
if hours > 0 and i <= 2:
f += str(hours) + 'h'
i += 1
if minutes > 0 and i <= 2:
f += str(minutes) + 'm'
i += 1
if secondsf > 0 and i <= 2:
f += str(secondsf) + 's'
i += 1
if millis > 0 and i <= 2:
f += str(millis) + 'ms'
i += 1
if f == '':
f = '0ms'
return f
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
import torchvision
import torchvision.transforms as transforms
class Dataloder():
    """CIFAR-10 train/test DataLoader factory.

    Training uses random 32x32 crops (padding 4) plus horizontal flips;
    testing only converts to tensor.  Both splits are normalized with
    the CIFAR-10 channel statistics and downloaded to ./data on demand.
    """
    def __init__(self, args):
        # Shared normalization with CIFAR-10 channel mean/std.
        normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                         (0.2023, 0.1994, 0.2010))
        train_tf = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        test_tf = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
        train_set = torchvision.datasets.CIFAR10(
            root='./data', train=True, download=True, transform=train_tf)
        test_set = torchvision.datasets.CIFAR10(
            root='./data', train=False, download=True, transform=test_tf)
        # Worker processes / pinned memory only help when CUDA is in use.
        loader_kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}
        self.trainloader = torch.utils.data.DataLoader(
            train_set, batch_size=args.batch_size, shuffle=True, **loader_kwargs)
        self.testloader = torch.utils.data.DataLoader(
            test_set, batch_size=args.batch_size, shuffle=False, **loader_kwargs)

    def getloader(self):
        """Return the pair (train_loader, test_loader)."""
        return self.trainloader, self.testloader
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# refer to https://github.com/pytorch/vision/blob/master/torchvision/
import torch.utils.data as data
import torchvision
from PIL import Image
import os
import os.path
# File suffixes recognised as images.  Both lower- and upper-case
# spellings are listed explicitly because is_image_file() below does a
# case-sensitive endswith() check.
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]
def is_image_file(filename):
    """Return True when *filename* ends with a recognised image suffix."""
    return filename.endswith(tuple(IMG_EXTENSIONS))
def find_classes(dir):
    """Discover class names as the sub-directories of *dir*.

    Returns the sorted class-name list and a mapping from class name to
    a contiguous integer index.
    """
    classes = sorted(entry for entry in os.listdir(dir)
                     if os.path.isdir(os.path.join(dir, entry)))
    class_to_idx = {name: idx for idx, name in enumerate(classes)}
    return classes, class_to_idx
def make_dataset(dir, class_to_idx):
    """Collect (path, class-index) samples from a tiny-imagenet style tree.

    Each class's files are expected under ``dir/<class>/images``; class
    directories without an ``images`` sub-folder are skipped.
    """
    samples = []
    for target in os.listdir(dir):
        image_dir = os.path.join(dir, target, 'images')
        if not os.path.isdir(image_dir):
            continue
        for root, _, fnames in sorted(os.walk(image_dir)):
            samples.extend((os.path.join(root, fname), class_to_idx[target])
                           for fname in fnames if is_image_file(fname))
    return samples
def default_loader(path):
    """Open *path* with PIL and force a 3-channel RGB image."""
    image = Image.open(path)
    return image.convert('RGB')
class DatasetLoader(data.Dataset):
    """Image-folder dataset over a tiny-imagenet style directory tree.

    Classes are discovered as sub-directories of *root*; samples come
    from each class's ``images`` folder.  Raises RuntimeError when no
    image is found.
    """

    def __init__(self, root, transform=None, target_transform=None,
                 loader=default_loader):
        classes, class_to_idx = find_classes(root)
        samples = make_dataset(root, class_to_idx)
        if not samples:
            raise(RuntimeError("Found 0 images in subfolders of: " + root \
                               + "\nSupported image extensions are: " + \
                               ",".join(IMG_EXTENSIONS)))
        self.root = root
        self.imgs = samples
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        """Load the sample at *index*; returns (image, class_index)."""
        path, target = self.imgs[index]
        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target

    def __len__(self):
        return len(self.imgs)
def annotation_reader(root, class_to_idx):
    """Parse tiny-imagenet's ``val_annotations.txt`` under *root*.

    Each line is tab-separated with the image file name first and the
    class name second; returns a list of [path, class_index] pairs, one
    per line, pointing into ``root/images``.
    """
    images = []
    # BUGFIX: the file handle was opened and never closed (leak);
    # a context manager guarantees it is closed even on error.
    with open(os.path.join(root, 'val_annotations.txt'), 'r') as annotations:
        for line in annotations:
            sp = line.split('\t')
            path = os.path.join(root, 'images', sp[0])
            images.append([path, class_to_idx[sp[1]]])
    return images
class ValDatasetLoader(data.Dataset):
    """Tiny-imagenet validation split, labelled via val_annotations.txt."""

    def __init__(self, root, classes, class_to_idx,
                 transform=None, target_transform=None, loader=default_loader):
        self.root = root
        # Samples are [path, class_index] pairs read from the annotations file.
        self.imgs = annotation_reader(root, class_to_idx)
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        """Load the sample at *index*; returns (image, class_index)."""
        path, target = self.imgs[index]
        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target

    def __len__(self):
        return len(self.imgs)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
import torch.utils.data as data
import torchvision
from torchvision import transforms
from PIL import Image
import os
import os.path
# PCA statistics of ImageNet RGB pixel values ('eigval' = eigenvalues,
# 'eigvec' = eigenvectors), consumed by the Lighting transform below
# for AlexNet-style PCA colour-noise augmentation.
_imagenet_pca = {
    'eigval': torch.Tensor([0.2175, 0.0188, 0.0045]),
    'eigvec': torch.Tensor([
        [-0.5675, 0.7192, 0.4009],
        [-0.5808, -0.0045, -0.8140],
        [-0.5836, -0.6948, 0.4203],
    ])
}
def find_classes(dir):
    """Return (sorted class names, {class name -> index}) from *dir*'s sub-dirs."""
    names = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
    names.sort()
    index_of = {name: position for position, name in enumerate(names)}
    return names, index_of
def make_dataset(filename, datadir, class_to_idx):
    """Resolve an image list file into parallel (images, labels) lists.

    *filename* holds one image path per line, relative to *datadir*.
    The label of each image is derived from its parent directory name
    via *class_to_idx*.  Every listed file must exist on disk.
    """
    images, labels = [], []
    with open(filename, "r") as lines:
        for line in lines:
            image_path = os.path.join(datadir, line.rstrip('\n'))
            # Validation kept from the original: fail fast on missing files.
            assert os.path.isfile(image_path)
            class_name = os.path.split(os.path.dirname(image_path))[1]
            images.append(image_path)
            labels.append(class_to_idx[class_name])
    return images, labels
class MINCDataloder(data.Dataset):
    """MINC-2500 split dataset.

    Class names come from ``root/images``; the sample list comes from
    ``root/labels/train1.txt`` or ``root/labels/test1.txt`` depending
    on *train*.  Images are served as RGB with an optional transform.
    """
    def __init__(self, root, train=True, transform=None):
        self.transform = transform
        classes, class_to_idx = find_classes(root + '/images')
        split_file = 'labels/train1.txt' if train else 'labels/test1.txt'
        self.images, self.labels = make_dataset(
            os.path.join(root, split_file), root, class_to_idx)
        assert (len(self.images) == len(self.labels))
    def __getitem__(self, index):
        sample = Image.open(self.images[index]).convert('RGB')
        label = self.labels[index]
        if self.transform is not None:
            sample = self.transform(sample)
        return sample, label
    def __len__(self):
        return len(self.images)
class Dataloder():
    """MINC-2500 train/test DataLoader factory (ImageNet-style pipeline).

    Training uses resize + random resized crop + flip + colour jitter +
    PCA lighting noise; testing uses resize + center crop.  Both share
    ImageNet normalization statistics.
    """
    def __init__(self, args):
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_tf = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(0.4,0.4,0.4),
            transforms.ToTensor(),
            Lighting(0.1, _imagenet_pca['eigval'], _imagenet_pca['eigvec']),
            normalize,
        ])
        test_tf = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])
        data_root = os.path.expanduser('~/data/minc-2500/')
        train_set = MINCDataloder(root=data_root, train=True, transform=train_tf)
        test_set = MINCDataloder(root=data_root, train=False, transform=test_tf)
        loader_kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}
        self.trainloader = torch.utils.data.DataLoader(
            train_set, batch_size=args.batch_size, shuffle=True, **loader_kwargs)
        self.testloader = torch.utils.data.DataLoader(
            test_set, batch_size=args.test_batch_size, shuffle=False,
            **loader_kwargs)
    def getloader(self):
        """Return the pair (train_loader, test_loader)."""
        return self.trainloader, self.testloader
class Lighting(object):
    """AlexNet-style PCA-based lighting (colour) noise.

    Adds a random linear combination of the RGB PCA eigenvectors,
    scaled by the eigenvalues and a N(0, alphastd) draw, to every
    pixel of a (3, H, W) tensor image.
    """
    def __init__(self, alphastd, eigval, eigvec):
        self.alphastd = alphastd
        self.eigval = eigval
        self.eigvec = eigvec

    def __call__(self, img):
        # Zero magnitude means the transform is a no-op.
        if self.alphastd == 0:
            return img
        alpha = img.new().resize_(3).normal_(0, self.alphastd)
        scaled = self.eigvec.type_as(img).clone()
        scaled = scaled.mul(alpha.view(1, 3).expand(3, 3))
        scaled = scaled.mul(self.eigval.view(1, 3).expand(3, 3))
        rgb = scaled.sum(1).squeeze()
        return img.add(rgb.view(3, 1, 1).expand_as(img))
# Smoke test: build both MINC-2500 splits (requires the dataset to be
# present under ~/data/minc-2500/) and print their sizes.
if __name__ == "__main__":
    trainset = MINCDataloder(root=os.path.expanduser('~/data/minc-2500/'), train=True)
    testset = MINCDataloder(root=os.path.expanduser('~/data/minc-2500/'), train=False)
    print(len(trainset))
    print(len(testset))
......@@ -10,6 +10,9 @@
from __future__ import print_function
import matplotlib.pyplot as plot
import importlib
import torch
import torch.nn as nn
import torch.nn.functional as F
......@@ -17,115 +20,172 @@ import torch.optim as optim
from torch.autograd import Variable
from option import Options
from model.encodenet import Net
from utils import *
from encoding.utils import *
# global variable
# Training state shared with the nested train()/test() closures in main().
# NOTE(review): best_pred is assigned twice — this looks like a merge
# artifact.  The second assignment (100.0, i.e. an error rate where
# lower is better) is the one that takes effect; confirm which metric
# convention the rest of the script expects before cleaning this up.
best_pred = 0.0
acclist = []
best_pred = 100.0
errlist_train = []
errlist_val = []
def adjust_learning_rate(optimizer, args, epoch, best_pred):
    """Apply a step-decay learning-rate schedule to *optimizer* in place.

    Up to epoch 60 the rate is ``args.lr`` decayed by 10x every 40
    epochs; afterwards it is pinned at 1e-4.  With two parameter groups
    the second (head) group runs at 10x the base rate.  Raises
    RuntimeError for any other number of groups.
    """
    if epoch > 60:
        new_lr = 1e-4
    else:
        new_lr = args.lr * (0.1 ** ((epoch - 1) // 40))
    print('=>Epochs %i, learning rate = %.4f, previous best = %.3f%%' % (
        epoch, new_lr, best_pred))
    groups = optimizer.param_groups
    if len(groups) == 1:
        groups[0]['lr'] = new_lr
    elif len(groups) == 2:
        # enlarge the lr at the head
        groups[0]['lr'] = new_lr
        groups[1]['lr'] = new_lr * 10
    else:
        raise RuntimeError('unsupported number of param groups: {}' \
            .format(len(groups)))
def main():
    """Train/evaluate entry point.

    NOTE(review): this body contains TWO merged variants of the script —
    an older CIFAR-only flow (args parsing through the first epoch loop)
    followed by the newer importlib-driven flow.  The ``global``
    statement in the second half appears after ``best_pred`` has already
    been assigned above, which CPython rejects as a SyntaxError, so this
    function needs a manual merge resolution, not just review.  The
    later ``train``/``test`` definitions shadow the earlier ones.
    """
    # init the args
    args = Options().parse()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    # init dataloader
    if args.dataset == 'cifar':
        from dataset.cifar import Dataloder
        train_loader, test_loader = Dataloder(args).getloader()
    else:
        raise ValueError('Unknow dataset!')
    model = Net()
    if args.cuda:
        model.cuda()
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_pred = checkpoint['best_pred']
            acclist = checkpoint['acclist']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no resume checkpoint found at '{}'".format(args.resume))
    criterion = nn.CrossEntropyLoss()
    # TODO make weight_decay oen of args
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=
        args.momentum, weight_decay=1e-4)

    def train(epoch):
        # Older training closure (shadowed by the second train() below).
        model.train()
        global best_pred
        train_loss, correct, total = 0,0,0
        # NOTE(review): argument order here differs from the
        # adjust_learning_rate(optimizer, args, epoch, best_pred)
        # signature defined above — stale old-version call site.
        adjust_learning_rate(optimizer, epoch, best_pred, args)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
            pred = output.data.max(1)[1]
            correct += pred.eq(target.data).cpu().sum()
            total += target.size(0)
            progress_bar(batch_idx, len(train_loader),
                'Loss: %.3f | Acc: %.3f%% (%d/%d)' % (train_loss/(batch_idx+1),
                100.*correct/total, correct, total))

    def test(epoch):
        # Older evaluation closure (shadowed by the second test() below).
        model.eval()
        global best_pred
        global acclist
        test_loss, correct, total = 0,0,0
        acc = 0.0
        is_best = False
        # for data, target in test_loader:
        for batch_idx, (data, target) in enumerate(test_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            test_loss += criterion(output, target).data[0]
            # get the index of the max log-probability
            pred = output.data.max(1)[1]
            correct += pred.eq(target.data).cpu().sum()
            total += target.size(0)
            acc = 100.*correct/total
            progress_bar(batch_idx, len(test_loader),
                'Loss: %.3f | Acc: %.3f%% (%d/%d)'% (test_loss/(batch_idx+1),
                acc, correct, total))
        # save checkpoint
        acclist += [acc]
        if acc > best_pred:
            best_pred = acc
            is_best = True
        save_checkpoint({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'best_pred': best_pred,
            'acclist':acclist,
            }, args=args, is_best=is_best)

    # TODO add plot curve
    for epoch in range(args.start_epoch, args.epochs + 1):
        train(epoch)
        # FIXME this is a bug somewhere not in the code
        test(epoch)

    # init the args
    # NOTE(review): start of the second merged variant; the global
    # declaration below is illegal after the assignments to best_pred
    # earlier in this function.
    global best_pred, errlist_train, errlist_val
    args = Options().parse()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    # plot
    if args.plot:
        print('=>Enabling matplotlib for display:')
        plot.ion()
        plot.show()
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    # init dataloader
    dataset = importlib.import_module('dataset.'+args.dataset)
    Dataloder = dataset.Dataloder
    train_loader, test_loader = Dataloder(args).getloader()
    # init the model
    models = importlib.import_module('model.'+args.model)
    model = models.Net()
    print(model)
    # criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = get_optimizer(args, model)
    if args.cuda:
        model.cuda()
        # Please use CUDA_VISIBLE_DEVICES to control the number of gpus
        model = torch.nn.DataParallel(model)
    """
    optim.SGD(model.parameters(), lr=args.lr, momentum=
        args.momentum, weight_decay=args.weight_decay)
    """
    # check point
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] +1
            best_pred = checkpoint['best_pred']
            errlist_train = checkpoint['errlist_train']
            errlist_val = checkpoint['errlist_val']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no resume checkpoint found at '{}'".\
                format(args.resume))
    #scheduler = CosLR_Scheduler(args, len(train_loader))

    def train(epoch):
        # Effective training closure: one epoch of SGD; appends the final
        # batch-average error rate to errlist_train.
        model.train()
        global best_pred, errlist_train
        train_loss, correct, total = 0,0,0
        adjust_learning_rate(optimizer, args, epoch, best_pred)
        for batch_idx, (data, target) in enumerate(train_loader):
            #scheduler(optimizer, batch_idx, epoch, best_pred)
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
            pred = output.data.max(1)[1]
            correct += pred.eq(target.data).cpu().sum()
            total += target.size(0)
            err = 100-100.*correct/total
            progress_bar(batch_idx, len(train_loader),
                'Loss: %.3f | Err: %.3f%% (%d/%d)' % \
                (train_loss/(batch_idx+1),
                err, total-correct, total))
        errlist_train += [err]

    def test(epoch):
        # Effective evaluation closure: computes the running error rate,
        # checkpoints on improvement, and optionally live-plots curves.
        model.eval()
        global best_pred, errlist_train, errlist_val
        test_loss, correct, total = 0,0,0
        is_best = False
        for batch_idx, (data, target) in enumerate(test_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            test_loss += criterion(output, target).data[0]
            # get the index of the max log-probability
            pred = output.data.max(1)[1]
            correct += pred.eq(target.data).cpu().sum()
            total += target.size(0)
            err = 100-100.*correct/total
            progress_bar(batch_idx, len(test_loader),
                'Loss: %.3f | Err: %.3f%% (%d/%d)'% \
                (test_loss/(batch_idx+1),
                err, total-correct, total))
        if args.eval:
            print('Error rate is %.3f'%err)
            return
        # save checkpoint
        errlist_val += [err]
        if err < best_pred:
            best_pred = err
            is_best = True
        save_checkpoint({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_pred': best_pred,
            'errlist_train':errlist_train,
            'errlist_val':errlist_val,
            }, args=args, is_best=is_best)
        if args.plot:
            plot.clf()
            plot.xlabel('Epoches: ')
            plot.ylabel('Error Rate: %')
            plot.plot(errlist_train, label='train')
            plot.plot(errlist_val, label='val')
            plot.legend(loc='upper left')
            plot.draw()
            plot.pause(0.001)

    if args.eval:
        test(args.start_epoch)
        return
    for epoch in range(args.start_epoch, args.epochs + 1):
        train(epoch)
        test(epoch)
    # save train_val curve to a file
    if args.plot:
        plot.clf()
        plot.xlabel('Epoches: ')
        plot.ylabel('Error Rate: %')
        plot.plot(errlist_train, label='train')
        plot.plot(errlist_val, label='val')
        plot.savefig("runs/%s/%s/"%(args.dataset, args.checkname)
            +'train_val.jpg')
# Script entry point.
if __name__ == "__main__":
    # BUGFIX: the original invoked main() twice back-to-back — almost
    # certainly diff/merge residue.  A single invocation is intended;
    # a second run would retrain on top of the mutated global state.
    main()
# Download the pre-trained MINC model checkpoint into models/.
cd models
wget -O minc.pth.tar https://www.dropbox.com/s/0q57t0nd1tka2qx/minc.pth.tar?dl=1
cd ..
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
import torch.nn as nn
import model.mynn as nn2
from encoding import Encoding
class Net(nn.Module):
    """Pre-activation ResNet topped with an Encoding layer classifier.

    Pipeline: 3x3 conv -> four residual stages -> BN/ReLU ->
    Encoding(K=16) -> BN1d/ReLU -> flatten -> linear head.
    """

    def __init__(self, num_blocks=[2,2,2,2], num_classes=10,
                 block=nn2.Bottleneck):
        super(Net, self).__init__()
        # Basic blocks keep the channel count; bottlenecks expand it 4x.
        self.expansion = 1 if block == nn2.Basicblock else 4
        self.inplanes = 64
        num_planes = [64, 128, 256, 512]
        strides = [1, 2, 2, 2]
        layers = [nn.Conv2d(3, self.inplanes, kernel_size=3, padding=1),
                  nn.BatchNorm2d(self.inplanes),
                  nn.ReLU(inplace=True)]
        # Four residual stages; stages 2-4 halve the spatial resolution.
        for i in range(4):
            layers += [self._residual_unit(block, num_planes[i],
                                           num_blocks[i], strides[i])]
        # Encoding head and classifier.
        layers += [nn.BatchNorm2d(self.inplanes),
                   nn.ReLU(inplace=True),
                   Encoding(D=512*self.expansion, K=16),
                   nn.BatchNorm1d(16),
                   nn.ReLU(inplace=True),
                   nn2.View(-1, 512*self.expansion*16),
                   nn.Linear(512*self.expansion*16, num_classes)]
        self.model = nn.Sequential(*layers)
        print(layers)

    def _residual_unit(self, block, planes, n_blocks, stride):
        """Stack *n_blocks* blocks; only the first block may stride."""
        per_block_strides = [stride] + [1] * (n_blocks - 1)
        units = []
        for s in per_block_strides:
            units += [block(self.inplanes, planes, s)]
            self.inplanes = self.expansion * planes
        return nn.Sequential(*units)

    def forward(self, input):
        """Run the full pipeline on an (N, 3, H, W) batch."""
        return self.model(input)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.autograd import Variable
import encoding
import torchvision.models as resnet
class Net(nn.Module):
    """Texture/material classifier: pre-trained ResNet features + Encoding head."""

    def __init__(self, nclass=23, aux=False, backbone='resnet50'):
        super(Net, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        constructors = {
            'resnet50': resnet.resnet50,
            'resnet101': resnet.resnet101,
            'resnet152': resnet.resnet152,
        }
        if backbone not in constructors:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        self.pretrained = constructors[backbone](pretrained=True)
        self.aux = aux
        n_codes = 32
        # 1x1 bottleneck -> Encoding -> flatten -> L2 normalize -> linear.
        self.head = nn.Sequential(
            nn.Conv2d(2048, 128, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            encoding.nn.Encoding(D=128, K=n_codes),
            encoding.nn.View(-1, 128 * n_codes),
            encoding.nn.Normalize(),
            nn.Linear(128 * n_codes, nclass),
        )

    def forward(self, x):
        # Accept a Variable or a (possibly nested) tuple/list of them.
        if isinstance(x, Variable):
            _, _, h, w = x.size()
        elif isinstance(x, (tuple, list)):
            var_input = x
            while not isinstance(var_input, Variable):
                var_input = var_input[0]
            _, _, h, w = var_input.size()
        else:
            raise RuntimeError('unknown input type: ', type(x))
        if self.backbone in ('resnet50', 'resnet101', 'resnet152'):
            # pre-trained ResNet feature
            feat = self.pretrained.conv1(x)
            feat = self.pretrained.bn1(feat)
            feat = self.pretrained.relu(feat)
            feat = self.pretrained.maxpool(feat)
            feat = self.pretrained.layer1(feat)
            feat = self.pretrained.layer2(feat)
            feat = self.pretrained.layer3(feat)
            feat = self.pretrained.layer4(feat)
        else:
            feat = self.pretrained(x)
        return self.head(feat)
def test():
    """Smoke-test Net on one random 224x224 input (CUDA required) and
    print the model, its output, and the total parameter count."""
    net = Net(nclass=23).cuda()
    print(net)
    x = Variable(torch.randn(1,3,224,224)).cuda()
    y = net(x)
    print(y)
    # Count parameters with a generator sum; the original accumulated
    # into a local named ``sum``, shadowing the builtin.
    total_params = sum(param.nelement() for param in net.parameters())
    print('Total params:', total_params)
# Run the smoke test when executed directly (requires a CUDA device).
if __name__ == "__main__":
    test()
......@@ -8,114 +8,437 @@
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import encoding
from encoding import Encoding
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class Basicblock(nn.Module):
    # NOTE(review): this class body contains TWO __init__/forward pairs —
    # residue from a bad merge.  Python keeps the later definitions, so
    # the first pair is dead code (and also broken: it reads
    # self.expansion, which Basicblock never defines).  Remove the dead
    # pair when resolving the merge.
    def __init__(self, inplanes, planes, stride=1,
                 norm_layer=nn.BatchNorm2d):
        super(Basicblock, self).__init__()
        if inplanes != planes*self.expansion or stride !=1 :
            self.downsample = True
            self.residual_layer = nn.Conv2d(inplanes, planes,
                kernel_size=1, stride=stride)
        else:
            self.downsample = False
        conv_block=[]
        conv_block+=[norm_layer(inplanes),
                     nn.ReLU(inplace=True),
                     nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=1),
                     norm_layer(planes),
                     nn.ReLU(inplace=True),
                     nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1),
                     norm_layer(planes)]
        self.conv_block = nn.Sequential(*conv_block)

    def forward(self, input):
        # Dead code — overridden by the second forward below.
        if self.downsample:
            residual = self.residual_layer(input)
        else:
            residual = input
        return residual + self.conv_block(input)

    """ Pre-activation residual block
    Identity Mapping in Deep Residual Networks
    ref https://arxiv.org/abs/1603.05027
    """
    def __init__(self, inplanes, planes, stride=1,
                 norm_layer=nn.BatchNorm2d):
        # Effective constructor: 1x1 projection shortcut when the
        # channel count or stride changes the output shape.
        super(Basicblock, self).__init__()
        if inplanes != planes or stride !=1 :
            self.downsample = True
            self.residual_layer = nn.Conv2d(inplanes, planes,
                kernel_size=1, stride=stride)
        else:
            self.downsample = False
        conv_block=[]
        conv_block+=[norm_layer(inplanes),
                     nn.ReLU(inplace=True),
                     nn.Conv2d(inplanes, planes, kernel_size=3,
                               stride=stride, padding=1),
                     norm_layer(planes),
                     nn.ReLU(inplace=True),
                     nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1)]
        self.conv_block = nn.Sequential(*conv_block)

    def forward(self, input):
        #print(input.size())
        # Effective forward: shortcut (projected when shapes differ)
        # plus the pre-activation residual branch.
        if self.downsample:
            residual = self.residual_layer(input)
        else:
            residual = input
        return residual + self.conv_block(input)
class Bottleneck(nn.Module):
    """ Pre-activation residual block
    Identity Mapping in Deep Residual Networks
    ref https://arxiv.org/abs/1603.05027
    """
    # NOTE(review): two __init__/forward pairs below — merge residue.
    # Python keeps the later definitions; the two pairs are functionally
    # identical here, so behaviour is unaffected, but the dead pair
    # should be removed when the merge is resolved.
    def __init__(self, inplanes, planes, stride=1,norm_layer=nn.BatchNorm2d):
        super(Bottleneck, self).__init__()
        self.expansion = 4
        if inplanes != planes*self.expansion or stride !=1 :
            self.downsample = True
            self.residual_layer = nn.Conv2d(inplanes, planes * self.expansion,
                kernel_size=1, stride=stride)
        else:
            self.downsample = False
        conv_block = []
        conv_block += [norm_layer(inplanes),
                       nn.ReLU(inplace=True),
                       nn.Conv2d(inplanes, planes, kernel_size=1, stride=1)]
        conv_block += [norm_layer(planes),
                       nn.ReLU(inplace=True),
                       nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                                 padding=1)]
        conv_block += [norm_layer(planes),
                       nn.ReLU(inplace=True),
                       nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                                 stride=1)]
        self.conv_block = nn.Sequential(*conv_block)

    def forward(self, x):
        # Dead code — overridden by the second forward below.
        if self.downsample:
            residual = self.residual_layer(x)
        else:
            residual = x
        return residual + self.conv_block(x)

    """ Pre-activation residual block
    Identity Mapping in Deep Residual Networks
    ref https://arxiv.org/abs/1603.05027
    """
    def __init__(self, inplanes, planes, stride=1,norm_layer=nn.BatchNorm2d):
        # Effective constructor: 1x1/3x3/1x1 bottleneck with a 1x1
        # projection shortcut when the output shape changes.
        super(Bottleneck, self).__init__()
        self.expansion = 4
        if inplanes != planes*self.expansion or stride !=1 :
            self.downsample = True
            self.residual_layer = nn.Conv2d(inplanes,
                planes * self.expansion, kernel_size=1, stride=stride)
        else:
            self.downsample = False
        conv_block = []
        conv_block += [norm_layer(inplanes),
                       nn.ReLU(inplace=True),
                       nn.Conv2d(inplanes, planes, kernel_size=1,
                                 stride=1)]
        conv_block += [norm_layer(planes),
                       nn.ReLU(inplace=True),
                       nn.Conv2d(planes, planes, kernel_size=3,
                                 stride=stride, padding=1)]
        conv_block += [norm_layer(planes),
                       nn.ReLU(inplace=True),
                       nn.Conv2d(planes, planes * self.expansion,
                                 kernel_size=1, stride=1)]
        self.conv_block = nn.Sequential(*conv_block)

    def forward(self, x):
        # Effective forward: shortcut (projected when shapes differ)
        # plus the pre-activation residual branch.
        if self.downsample:
            residual = self.residual_layer(x)
        else:
            residual = x
        return residual + self.conv_block(x)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class ResNeXtBlock(nn.Module):
    """
    Aggregated Residual Transformations for Deep Neural Networks
    ref https://arxiv.org/abs/1611.05431
    """

    def __init__(self, inplanes, planes, cardinality=32, base_width=4,
                 stride=1, expansion=4):
        super(ResNeXtBlock, self).__init__()
        # Per-group width, scaled from the 64-channel baseline.
        width = int(math.floor(planes * (base_width/64.0)))
        group_width = cardinality * width
        out_channels = expansion * group_width
        self.conv_block = nn.Sequential(
            nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False),
            nn.BatchNorm2d(group_width),
            nn.ReLU(inplace=True),
            nn.Conv2d(group_width, group_width, kernel_size=3,
                      stride=stride, padding=1, groups=cardinality,
                      bias=False),
            nn.BatchNorm2d(group_width),
            nn.ReLU(inplace=True),
            nn.Conv2d(group_width, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
        # 1x1 projection shortcut whenever the output shape changes.
        self.downsample = stride != 1 or inplanes != out_channels
        if self.downsample:
            self.residual_layer = nn.Conv2d(inplanes, out_channels,
                                            kernel_size=1, stride=stride,
                                            bias=False)

    def forward(self, x):
        shortcut = self.residual_layer(x) if self.downsample else x
        return shortcut + self.conv_block(x)
class View(nn.Module):
    """Reshape the input to a fixed size (Tensor.view as an nn layer)."""

    def __init__(self, *args):
        super(View, self).__init__()
        # Accept either an existing torch.Size or raw dimension ints.
        if len(args) == 1 and isinstance(args[0], torch.Size):
            self.size = args[0]
        else:
            self.size = torch.Size(args)

    def forward(self, input):
        return input.view(self.size)
class InstanceNormalization(nn.Module):
"""InstanceNormalization
Improves convergence of neural-style.
ref: https://arxiv.org/pdf/1607.08022.pdf
"""
def __init__(self, dim, eps=1e-5):
super(InstanceNormalization, self).__init__()
self.weight = nn.Parameter(torch.FloatTensor(dim))
self.bias = nn.Parameter(torch.FloatTensor(dim))
self.eps = eps
self._reset_parameters()
def _reset_parameters(self):
self.weight.data.uniform_()
self.bias.data.zero_()
def forward(self, x):
n = x.size(2) * x.size(3)
t = x.view(x.size(0), x.size(1), n)
mean = torch.mean(t, 2).unsqueeze(2).expand_as(x)
# Calculate the biased var. torch.var returns unbiased var
var = torch.var(t, 2).unsqueeze(2).expand_as(x) * ((n - 1) / float(n))
scale_broadcast = self.weight.unsqueeze(1).unsqueeze(1).unsqueeze(0)
scale_broadcast = scale_broadcast.expand_as(x)
shift_broadcast = self.bias.unsqueeze(1).unsqueeze(1).unsqueeze(0)
shift_broadcast = shift_broadcast.expand_as(x)
out = (x - mean) / torch.sqrt(var + self.eps)
out = out * scale_broadcast + shift_broadcast
return out
    def __init__(self, *args):
        # NOTE(review): this __init__/forward pair duplicates the View
        # implementation above and looks like merge residue; confirm
        # which class body it actually belongs to before relying on it.
        super(View, self).__init__()
        if len(args) == 1 and isinstance(args[0], torch.Size):
            self.size = args[0]
        else:
            self.size = torch.Size(args)
    def forward(self, input):
        # NOTE(review): duplicate of View.forward — merge residue.
        return input.view(self.size)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class DenseBlock(nn.Module):
    """
    Densely Connected Convolutional Networks
    ref https://arxiv.org/abs/1608.06993
    """

    def __init__(self, in_planes, growth_rate):
        super(DenseBlock, self).__init__()
        bottleneck_width = 4 * growth_rate
        self.model = nn.Sequential(
            nn.BatchNorm2d(in_planes),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_planes, bottleneck_width, kernel_size=1,
                      bias=False),
            nn.BatchNorm2d(bottleneck_width),
            nn.ReLU(inplace=True),
            nn.Conv2d(bottleneck_width, growth_rate, kernel_size=3,
                      padding=1, bias=False),
        )

    def forward(self, x):
        # New features are concatenated in front of the input features.
        return torch.cat([self.model(x), x], 1)
class Transition(nn.Module):
    """
    Densely Connected Convolutional Networks
    ref https://arxiv.org/abs/1608.06993
    """

    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        # BN -> ReLU -> 1x1 conv -> 2x2 average pool.
        self.model = nn.Sequential(
            nn.BatchNorm2d(in_planes),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_planes, out_planes, kernel_size=1,
                      bias=False),
            nn.AvgPool2d(2),
        )

    def forward(self, x):
        return self.model(x)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class SELayer(nn.Module):
    """Squeeze-and-Excitation channel re-weighting layer."""

    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        squeezed = int(channel / reduction)
        self.fc = nn.Sequential(
            nn.Linear(channel, squeezed),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        batch, channels = x.size(0), x.size(1)
        # Squeeze: global average pool to one value per channel.
        gate = self.avg_pool(x).view(batch, channels)
        # Excite: two-layer bottleneck producing per-channel scales in (0, 1).
        gate = self.fc(gate).view(batch, channels, 1, 1)
        return x * gate
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding 1 and no bias."""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3,
                     stride=stride, padding=1, bias=False)
class SEBasicBlock(nn.Module):
    """ResNet basic block with a Squeeze-and-Excitation gate."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, reduction=16):
        super(SEBasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes, 1)
        self.bn2 = nn.BatchNorm2d(planes)
        self.se = SELayer(planes, reduction)
        self.stride = stride
        # 1x1 projection shortcut whenever the output shape differs.
        self.downsample = inplanes != planes or stride != 1
        if self.downsample:
            self.residual_layer = nn.Conv2d(inplanes, planes,
                                            kernel_size=1, stride=stride)

    def forward(self, x):
        shortcut = self.residual_layer(x) if self.downsample else x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.se(self.bn2(self.conv2(out)))
        out += shortcut
        return self.relu(out)
class SEBottleneck(nn.Module):
    """ResNet bottleneck block with a Squeeze-and-Excitation gate.

    The main branch outputs ``planes * 4`` channels (``expansion = 4``).
    The *downsample* constructor argument is kept for signature
    compatibility but unused, matching the original.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=16):
        super(SEBottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes * 4, reduction)
        self.stride = stride
        # BUGFIX: the shortcut must produce planes*expansion channels to
        # match the main branch.  The original projected to ``planes``
        # and compared ``inplanes != planes``, so ``out += residual``
        # always failed with a channel mismatch.  This now mirrors the
        # (correct) EBottleneck below.
        if inplanes != planes * self.expansion or stride != 1:
            self.downsample = True
            self.residual_layer = nn.Conv2d(inplanes, planes * self.expansion,
                                            kernel_size=1, stride=stride)
        else:
            self.downsample = False

    def forward(self, x):
        """Bottleneck transform + SE gate, added to the (projected) shortcut."""
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        out = self.se(out)
        if self.downsample:
            residual = self.residual_layer(x)
        out += residual
        out = self.relu(out)
        return out
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class ELayer(nn.Module):
    """SE-style channel gate whose squeeze step uses an Encoding layer."""

    def __init__(self, channel, K=16, reduction=4):
        super(ELayer, self).__init__()
        squeezed = int(channel / reduction)
        # 1x1 bottleneck -> Encoding(K codewords) -> flatten -> linear gate.
        self.fc = nn.Sequential(
            nn.Conv2d(channel, squeezed, 1),
            nn.BatchNorm2d(squeezed),
            nn.ReLU(inplace=True),
            Encoding(D=squeezed, K=K),
            nn.BatchNorm1d(K),
            nn.ReLU(inplace=True),
            View(-1, squeezed * K),
            nn.Linear(squeezed * K, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        batch, channels = x.size(0), x.size(1)
        gate = self.fc(x).view(batch, channels, 1, 1)
        return x * gate
class EBasicBlock(nn.Module):
    """ResNet basic block gated by an Encoding-based attention layer."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, K=16):
        super(EBasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes, 1)
        self.bn2 = nn.BatchNorm2d(planes)
        self.se = ELayer(planes, K, self.expansion * 4)
        self.stride = stride
        # 1x1 projection shortcut whenever the output shape differs.
        self.downsample = inplanes != planes or stride != 1
        if self.downsample:
            self.residual_layer = nn.Conv2d(inplanes, planes,
                                            kernel_size=1, stride=stride)

    def forward(self, x):
        shortcut = self.residual_layer(x) if self.downsample else x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.se(self.bn2(self.conv2(out)))
        out += shortcut
        return self.relu(out)
class EBottleneck(nn.Module):
    """ResNet bottleneck block gated by an Encoding-based attention layer.

    Output has ``planes * 4`` channels; the *downsample* argument is
    kept for signature compatibility but unused.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, K=16):
        super(EBottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.se = ELayer(planes * self.expansion, K, self.expansion * 4)
        self.stride = stride
        # 1x1 projection shortcut whenever the output shape differs.
        self.downsample = inplanes != planes * self.expansion or stride != 1
        if self.downsample:
            self.residual_layer = nn.Conv2d(inplanes, planes * self.expansion,
                                            kernel_size=1, stride=stride)

    def forward(self, x):
        shortcut = self.residual_layer(x) if self.downsample else x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.se(self.bn3(self.conv3(out)))
        out += shortcut
        return self.relu(out)
class EResNeXtBottleneck(nn.Module):
    """
    RexNeXt bottleneck type C (https://github.com/facebookresearch/ResNeXt/blob/master/models/resnext.lua)

    Grouped 3x3 convolution (``cardinality`` groups) between a 1x1 reduce
    and a 1x1 expand, followed by an ELayer channel-attention stage before
    the residual addition.

    Args:
        inplanes: input channel count.
        planes: base width; output has ``planes * 4`` channels.
        cardinality: number of convolution groups.
        base_width: per-group base width at 64 planes.
        stride: stride of the grouped 3x3 conv.
        downsample: optional module projecting the shortcut.
        K: number of codewords for the ELayer.
    """
    # NOTE: the docstring above was originally placed *after* ``expansion``,
    # where it was a no-op string rather than the class docstring.
    expansion = 4

    def __init__(self, inplanes, planes, cardinality, base_width, stride=1, downsample=None, K=32):
        super(EResNeXtBottleneck, self).__init__()
        D = int(math.floor(planes * (base_width / 64.0)))
        C = cardinality
        self.conv_reduce = nn.Conv2d(inplanes, D * C, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn_reduce = nn.BatchNorm2d(D * C)
        self.conv_conv = nn.Conv2d(D * C, D * C, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
        self.bn = nn.BatchNorm2d(D * C)
        self.conv_expand = nn.Conv2d(D * C, planes * 4, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn_expand = nn.BatchNorm2d(planes * 4)
        self.se = ELayer(planes * 4, K, self.expansion * 4)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        bottleneck = self.conv_reduce(x)
        bottleneck = F.relu(self.bn_reduce(bottleneck), inplace=True)
        bottleneck = self.conv_conv(bottleneck)
        bottleneck = F.relu(self.bn(bottleneck), inplace=True)
        bottleneck = self.conv_expand(bottleneck)
        bottleneck = self.bn_expand(bottleneck)
        bottleneck = self.se(bottleneck)
        if self.downsample is not None:
            residual = self.downsample(x)
        return F.relu(residual + bottleneck, inplace=True)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
import torch.nn as nn
import model.mynn as nn2
class Net(nn.Module):
    """Residual network for small images (e.g. CIFAR).

    The stem is a single 3x3 conv; four residual stages use widths
    64/128/256/512 with strides [1, 2, 2, 2]; a BN-ReLU-AvgPool-Linear
    head produces the class scores.

    Args:
        num_blocks: number of blocks per stage (indexable of length 4).
        num_classes: size of the output layer.
        block: block class from ``model.mynn`` (Basicblock or Bottleneck).
    """

    # num_blocks default changed from a list to a tuple: mutable defaults
    # are shared across calls; a tuple is safe and indexes identically.
    def __init__(self, num_blocks=(2, 2, 2, 2), num_classes=10,
                 block=nn2.Bottleneck):
        super(Net, self).__init__()
        # Basic blocks keep the channel count; bottlenecks expand it 4x.
        if block == nn2.Basicblock:
            self.expansion = 1
        else:
            self.expansion = 4
        self.inplanes = 64
        num_planes = [64, 128, 256, 512]
        strides = [1, 2, 2, 2]
        model = []
        # Conv_1: stem convolution
        model += [nn.Conv2d(3, self.inplanes, kernel_size=3, padding=1),
                  nn.BatchNorm2d(self.inplanes),
                  nn.ReLU(inplace=True)]
        # Residual units
        for i in range(4):
            model += [self._residual_unit(block, num_planes[i], num_blocks[i],
                                          strides[i])]
        # Classification head
        model += [nn.BatchNorm2d(self.inplanes),
                  nn.ReLU(inplace=True),
                  nn.AvgPool2d(4),
                  nn2.View(-1, self.inplanes),
                  nn.Linear(self.inplanes, num_classes)]
        self.model = nn.Sequential(*model)
        # (removed leftover debug `print(model)`)

    def _residual_unit(self, block, planes, n_blocks, stride):
        """Stack ``n_blocks`` blocks; only the first one may be strided."""
        strides = [stride] + [1] * (n_blocks - 1)
        layers = []
        for i in range(n_blocks):
            layers += [block(self.inplanes, planes, strides[i])]
            self.inplanes = self.expansion * planes
        return nn.Sequential(*layers)

    def forward(self, input):
        return self.model(input)
......@@ -12,33 +12,46 @@ import argparse
import os
class Options():
    """Command-line options for training/evaluating the encoding models.

    The original file contained two complete ``__init__``/``parse`` pairs
    (a merge/diff artifact); the first pair was dead code, silently
    shadowed by the second. Only the current definitions are kept.
    """

    def __init__(self):
        # Training settings
        parser = argparse.ArgumentParser(description='Deep Encoding')
        parser.add_argument('--dataset', type=str, default='cifar10',
                            help='training dataset (default: cifar10)')
        parser.add_argument('--model', type=str, default='densenet',
                            help='network model type (default: densenet)')
        # scale factor for HangsNet only
        parser.add_argument('--widen', type=int, default=4, metavar='N',
                            help='widen factor of the network (default: 4)')
        # training hyper params
        parser.add_argument('--batch-size', type=int, default=128,
                            metavar='N', help='batch size for training (default: 128)')
        parser.add_argument('--test-batch-size', type=int, default=256,
                            metavar='N', help='batch size for testing (default: 256)')
        parser.add_argument('--epochs', type=int, default=300, metavar='N',
                            help='number of epochs to train (default: 300)')
        parser.add_argument('--start_epoch', type=int, default=1,
                            metavar='N', help='the epoch number to start (default: 0)')
        parser.add_argument('--lr', type=float, default=0.1, metavar='LR',
                            help='learning rate (default: 0.1)')
        parser.add_argument('--momentum', type=float, default=0.9,
                            metavar='M', help='SGD momentum (default: 0.9)')
        parser.add_argument('--weight-decay', type=float, default=1e-4,
                            metavar='M', help='SGD weight decay (default: 1e-4)')
        # cuda, seed and logging
        parser.add_argument('--no-cuda', action='store_true',
                            default=False, help='disables CUDA training')
        parser.add_argument('--plot', action='store_true', default=False,
                            help='matplotlib')
        parser.add_argument('--seed', type=int, default=1, metavar='S',
                            help='random seed (default: 1)')
        # checking point
        parser.add_argument('--resume', type=str, default=None,
                            help='put the path to resuming file if needed')
        parser.add_argument('--checkname', type=str, default='default',
                            help='set the checkpoint name')
        # evaluation option
        parser.add_argument('--eval', action='store_true', default=False,
                            help='evaluating')
        self.parser = parser

    def parse(self):
        """Parse ``sys.argv`` and return the argument namespace."""
        return self.parser.parse_args()
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
import shutil
import os
import sys
import time
import math
def adjust_learning_rate(optimizer, epoch, best_pred, args, milestones=(80, 120)):
    """Step-decay the learning rate of all parameter groups.

    The rate is ``args.lr`` scaled by 0.1 for every milestone epoch already
    passed (defaults 80 and 120, matching the original hard-coded schedule).

    Args:
        optimizer: optimizer whose ``param_groups`` are updated in place.
        epoch: current (1-based) epoch number.
        best_pred: best accuracy so far, only used for the log message.
        args: namespace providing the base learning rate ``args.lr``.
        milestones: epochs after which the rate is divided by 10.
    """
    lr = args.lr
    for milestone in milestones:
        if epoch > milestone:
            lr *= 0.1
    print('=>Epoches %i, learning rate = %.4f, previous best = %.3f%%' % (
        epoch, lr, best_pred))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
# Query the terminal width once at import time for progress_bar().
# `stty size` prints nothing when stdout is not a TTY (piped output,
# batch jobs), making the unpack raise ValueError and crash the import;
# fall back to shutil.get_terminal_size(), which always returns a width.
try:
    _, term_width = os.popen('stty size', 'r').read().split()
    term_width = int(term_width)
except ValueError:
    term_width = shutil.get_terminal_size().columns
def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
    """Saves checkpoint to disk.

    The checkpoint goes to ``runs/<dataset>/<checkname>/<filename>``; when
    ``is_best`` is true a copy named ``model_best.pth.tar`` is kept beside it.

    Args:
        state: picklable object handed to ``torch.save`` (e.g. a state dict).
        args: namespace providing ``args.dataset`` and ``args.checkname``.
        is_best: whether this checkpoint is the best so far.
        filename: checkpoint file name within the run directory.
    """
    directory = "runs/%s/%s/" % (args.dataset, args.checkname)
    # exist_ok avoids the check-then-create race when several processes
    # (or an earlier run) create the directory concurrently.
    os.makedirs(directory, exist_ok=True)
    filename = directory + filename
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, directory + 'model_best.pth.tar')
# taken from https://github.com/kuangliu/pytorch-cifar/blob/master/utils.py
# Width (in characters) of the drawn bar itself, excluding the timing text.
TOTAL_BAR_LENGTH = 86.
# Timestamps shared with progress_bar(): `last_time` marks the previous
# call (per-step timing), `begin_time` the start of the current bar.
last_time = time.time()
begin_time = last_time
def progress_bar(current, total, msg=None):
    """Draw a single-line text progress bar on stdout.

    Writes `` [===>...]  Step: ... | Tot: ... | msg  cur/total `` and uses
    carriage return / backspace control characters to redraw in place, so it
    must be called with ``current`` from 0 to ``total - 1`` on a real
    terminal. Relies on the module globals ``term_width``,
    ``TOTAL_BAR_LENGTH``, ``last_time`` and ``begin_time``.

    Args:
        current: zero-based index of the step just finished.
        total: total number of steps in this bar.
        msg: optional extra text appended after the timings.
    """
    global last_time, begin_time
    if current == 0:
        begin_time = time.time()  # Reset for new bar.
    # Split the bar into a completed part ('=') and a remaining part ('.').
    cur_len = int(TOTAL_BAR_LENGTH*current/total)
    rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
    sys.stdout.write(' [')
    for i in range(cur_len):
        sys.stdout.write('=')
    sys.stdout.write('>')
    for i in range(rest_len):
        sys.stdout.write('.')
    sys.stdout.write(']')
    # Per-step and cumulative timing since the bar was (re)started.
    cur_time = time.time()
    step_time = cur_time - last_time
    last_time = cur_time
    tot_time = cur_time - begin_time
    L = []
    L.append('  Step: %s' % format_time(step_time))
    L.append(' | Tot: %s' % format_time(tot_time))
    if msg:
        L.append(' | ' + msg)
    msg = ''.join(L)
    sys.stdout.write(msg)
    # Pad with spaces to the terminal edge so stale text is overwritten.
    for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
        sys.stdout.write(' ')
    # Go back to the center of the bar.
    for i in range(term_width-int(TOTAL_BAR_LENGTH/2)):
        sys.stdout.write('\b')
    sys.stdout.write(' %d/%d ' % (current+1, total))
    # Overwrite this line on the next call; newline only when finished.
    if current < total-1:
        sys.stdout.write('\r')
    else:
        sys.stdout.write('\n')
    sys.stdout.flush()
def format_time(seconds):
    """Format a non-negative duration in seconds as a compact string.

    At most the two most significant non-zero units are shown, drawn from
    days ('D'), hours ('h'), minutes ('m'), seconds ('s') and milliseconds
    ('ms'); a zero duration is rendered as '0ms'.
    """
    units = ((3600 * 24, 'D'), (3600, 'h'), (60, 'm'), (1, 's'))
    parts = []
    for size, suffix in units:
        amount = int(seconds / size)
        seconds -= amount * size
        if amount > 0 and len(parts) < 2:
            parts.append('%d%s' % (amount, suffix))
    millis = int(seconds * 1000)
    if millis > 0 and len(parts) < 2:
        parts.append('%dms' % millis)
    # Empty means the duration rounded down to nothing at every unit.
    return ''.join(parts) if parts else '0ms'
......@@ -8,7 +8,9 @@
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import io
import os
import re
import sys
import subprocess
......@@ -18,6 +20,23 @@ from setuptools.command.install import install
this_file = os.path.dirname(__file__)
def read(*names, **kwargs):
    """Return the text of a file located relative to this module's directory.

    Path components are joined onto ``dirname(__file__)``; the ``encoding``
    keyword selects the text encoding (default ``utf8``).
    """
    path = os.path.join(os.path.dirname(__file__), *names)
    encoding = kwargs.get("encoding", "utf8")
    with io.open(path, encoding=encoding) as handle:
        return handle.read()
def find_version(*file_paths):
    """Extract the ``__version__`` string from a source file.

    The path components are resolved via ``read`` (relative to this
    module); raises RuntimeError when no ``__version__ = '...'`` line is
    found.
    """
    contents = read(*file_paths)
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                      contents, re.M)
    if not match:
        raise RuntimeError("Unable to find version string.")
    return match.group(1)
_version = find_version('encoding/__init__.py')
#extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
......@@ -32,7 +51,7 @@ class TestCommand(install):
setup(
name="encoding",
version="0.0.1",
version=_version,
description="PyTorch Encoding Layer",
url="https://github.com/zhanghang1989/PyTorch-Encoding-Layer",
author="Hang Zhang",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment