We provide correctly dilated pre-trained ResNet and DenseNet models (output stride of 8) for semantic segmentation.
For ResNet, we set the stride-2 Conv3x3 at the beginning of the affected stages to stride 1 and update the dilation of the conv layers that follow.
For DenseNet, we provide :class:`encoding.nn.DilatedAvgPool2d`, which handles the dilation of the transition layers; the dilation of the conv layers that follow is updated accordingly.
All provided models have been verified.
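As a rough illustration (not the library's implementation), a torchvision ResNet can be dilated to an output stride of 8 by turning the stride-2 convolutions of the last two stages into stride-1 convolutions and applying dilation to the 3x3 convolutions; the helper ``dilate_stage`` below is hypothetical::

    import torch.nn as nn
    from torchvision.models import resnet50

    def dilate_stage(stage, dilation):
        """Set stride-2 convs in a ResNet stage to stride 1 and dilate the 3x3 convs."""
        for m in stage.modules():
            if isinstance(m, nn.Conv2d):
                if m.stride == (2, 2):
                    m.stride = (1, 1)
                if m.kernel_size == (3, 3):
                    m.dilation = (dilation, dilation)
                    m.padding = (dilation, dilation)

    model = resnet50(pretrained=True)
    dilate_stage(model.layer3, dilation=2)   # stage 3 no longer downsamples
    dilate_stage(model.layer4, dilation=4)   # overall output stride: 32 -> 8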
.. note::
    This code is released together with the paper (coming soon); please cite our work if you use it.
* Please follow the `PyTorch instructions <https://github.com/pytorch/pytorch#from-source>`_ to install PyTorch from source into the ``$HOME`` directory (recommended), or simply clone a copy into the ``$HOME`` directory::
To avoid the checkerboard artifacts of standard fractionally-strided convolution, we adopt an integer-stride convolution that produces :math:`2\times 2` outputs for each convolutional window.
.. image:: _static/img/upconv.png
:width: 50%
:align: center
Reference:
Hang Zhang and Kristin Dana. "Multi-style Generative Network for Real-time Transfer." *arXiv preprint arXiv:1703.06953 (2017)*
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the convolution
kernel_size (int or tuple): Size of the convolving kernel
stride (int or tuple, optional): Stride of the convolution. Default: 1
padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
output_padding (int or tuple, optional): Zero-padding added to one side of the output. Default: 0
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
bias (bool, optional): If True, adds a learnable bias to the output. Default: True
dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
scale_factor (int): scaling factor for upsampling convolution. Default: 1
Shape:
- Input: :math:`(N, C_{in}, H_{in}, W_{in})`
- Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
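The output-size formula is elided above. As a minimal sketch of the mechanism (not the library's implementation), an integer-stride convolution can emit :math:`scale^2` times the output channels and rearrange them into a :math:`scale\times scale` block per window, e.g. with a pixel shuffle; the class name below is illustrative::

    import torch
    import torch.nn as nn

    class NaiveUpsampleConv2d(nn.Module):
        """Sketch: conv emits scale*scale outputs per window, rearranged spatially."""
        def __init__(self, in_channels, out_channels, kernel_size, scale_factor=2):
            super().__init__()
            self.conv = nn.Conv2d(in_channels, out_channels * scale_factor ** 2,
                                  kernel_size, padding=kernel_size // 2)
            self.shuffle = nn.PixelShuffle(scale_factor)

        def forward(self, x):
            return self.shuffle(self.conv(x))

    x = torch.randn(1, 16, 32, 32)
    y = NaiveUpsampleConv2d(16, 8, kernel_size=3, scale_factor=2)(x)
    print(y.shape)  # torch.Size([1, 8, 64, 64])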
Encoding Layer: a learnable residual encoder over a 3D or 4D input, which is treated as a mini-batch.
...
...
@@ -35,6 +36,9 @@ class Encoding(nn.Module):
Please see the `example of training Deep TEN <./experiments/texture.html>`_.
Reference:
Hang Zhang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
Args:
D: dimension of the features (number of feature channels)
K: number of codewords
...
...
@@ -51,22 +55,19 @@ class Encoding(nn.Module):
>>> import encoding
>>> import torch
>>> import torch.nn.functional as F
>>> from torch.autograd import Variable
>>> B,C,H,W,K = 2,3,4,5,6
>>> X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5), requires_grad=True)
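A hedged continuation of this example (the constructor path ``encoding.nn.Encoding`` and the aggregate output shape :math:`(B, K, C)` are assumptions based on the description above)::

    >>> layer = encoding.nn.Encoding(C, K).double().cuda()  # assumed constructor
    >>> E = layer(X)  # assumed aggregate shape: (B, K, C)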
@@ -159,7 +159,7 @@ class EncodingShake(nn.Module):
+ str(self.D) + ')'
class Inspiration(Module):
r"""
Inspiration Layer (CoMatch Layer) enables multi-style transfer in a feed-forward network by learning to match the target feature statistics during training.
This module is differentiable and can be inserted into a standard feed-forward network to be learned directly from the loss function without additional supervision.
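A minimal sketch of the CoMatch idea, assuming the target style is summarized by a :math:`C\times C` Gram matrix :math:`G`; the class and method names below are illustrative, not the library's API::

    import torch
    import torch.nn as nn

    class CoMatchSketch(nn.Module):
        """Learn a CxC matrix W that maps the target Gram matrix onto the content features."""
        def __init__(self, channels):
            super().__init__()
            self.weight = nn.Parameter(torch.eye(channels))
            self.register_buffer('gram', torch.eye(channels))

        def set_target(self, gram):
            self.gram = gram  # (C, C) Gram matrix of the style features

        def forward(self, x):
            b, c, h, w = x.size()
            p = self.weight @ self.gram                   # (C, C)
            out = p.expand(b, c, c) @ x.view(b, c, -1)    # batched matmul
            return out.view(b, c, h, w)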
args: :attr:`args.lr_scheduler` lr scheduler mode (``cos``, ``poly``, ``step``), :attr:`args.lr` base learning rate, :attr:`args.epochs` number of epochs, :attr:`args.lr_step` step size for the ``step`` mode
niters: number of iterations per epoch
"""
def __init__(self, args, niters=0):
    self.mode = args.lr_scheduler
    print('Using {} LR Scheduler!'.format(self.mode))
    self.lr = args.lr
    if self.mode == 'step':
        self.lr_step = args.lr_step
    else:
        self.niters = niters
        self.N = args.epochs * niters
    self.epoch = -1
def __call__(self, optimizer, i, epoch):
    if self.mode == 'cos':
        T = (epoch - 1) * self.niters + i
        lr = 0.5 * self.lr * (1 + math.cos(1.0 * T / self.N * math.pi))
    elif self.mode == 'poly':
        T = (epoch - 1) * self.niters + i
        lr = self.lr * pow((1 - 1.0 * T / self.N), 0.9)
    elif self.mode == 'step':
        lr = self.lr * (0.1 ** ((epoch - 1) // self.lr_step))
    else:
        raise RuntimeError('Unknown LR scheduler!')
    if epoch > self.epoch:
        print('\n=> Epoch %i, learning rate = %.4f' % (epoch, lr))
        self.epoch = epoch
    self._adjust_learning_rate(optimizer, lr)
def _adjust_learning_rate(self, optimizer, lr):
    if len(optimizer.param_groups) == 1:
        optimizer.param_groups[0]['lr'] = lr
    else:
        # enlarge the lr at the head
        optimizer.param_groups[0]['lr'] = lr
        for i in range(1, len(optimizer.param_groups)):
            optimizer.param_groups[i]['lr'] = lr * 10
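A hedged usage sketch: the scheduler class name ``LR_Scheduler`` and the ``optimizer``/``train_loader`` objects are assumptions for illustration, ``math`` must be importable where the class is defined, and ``args`` only needs the attributes read in ``__init__``::

    import argparse

    # hypothetical: LR_Scheduler is the class defined above; optimizer and
    # train_loader come from the surrounding training script
    args = argparse.Namespace(lr_scheduler='cos', lr=0.1, epochs=60, lr_step=20)
    scheduler = LR_Scheduler(args, niters=len(train_loader))

    for epoch in range(1, args.epochs + 1):           # epochs are 1-indexed here
        for i, (image, target) in enumerate(train_loader):
            scheduler(optimizer, i, epoch)            # sets optimizer.param_groups[*]['lr']
            # ... forward / backward / optimizer.step() ...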
# refer to https://github.com/xternalz/WideResNet-pytorch