"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "8d477daed507801a50dc9f285c982b1c8051ae2d"
Commit 528210a3 authored by Hang Zhang's avatar Hang Zhang
Browse files

update docs

parent 30be3799
@@ -72,8 +72,7 @@ author = 'Hang Zhang'
# built documents.
#
# The short X.Y version.
-# TODO: change to [:2] at v1.0
-version = '0.0.1'
+version = 'master (0.0.1)'
# The full version, including alpha/beta/rc tags.
# TODO: verify this works as expected
release = 'master'
@@ -115,7 +114,7 @@ html_theme_options = {
    'logo_only': True,
}
-html_logo = '_static/img/favicon.png'
+html_logo = '_static/img/icon.png'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
...
.. role:: hidden
    :class: hidden-section
-encoding
-========
+Encoding Layer
+==============
.. automodule:: encoding
@@ -33,4 +33,16 @@ Functions
.. autoclass:: aggregate
    :members:
:hidden:`scaledL2`
~~~~~~~~~~~~~~~~~~~
.. autoclass:: scaledL2
:members:
:hidden:`residual`
~~~~~~~~~~~~~~~~~~~
.. autoclass:: residual
:members:
@@ -8,7 +8,7 @@
Encoding documentation
======================
-PyTorch-Encoding is an optimized PyTorch package using GPU, including Encoding Layer, Synchronized Batch Normalization.
+PyTorch-Encoding is an optimized PyTorch package using the GPU, including the Encoding Layer and Multi-GPU Synchronized Batch Normalization.
.. toctree::
    :glob:
@@ -18,10 +18,11 @@
    notes/*
.. toctree::
-    :maxdepth: 2
+    :maxdepth: 3
    :caption: Package Reference
    encoding
syncbn
Indices and tables
...
@@ -7,3 +7,104 @@ which is extending :mod:`torch.nn` and
Torch C and CUDA Backend
------------------------
Consider, as an example, the residual operation (over a mini-batch):
.. math::
r_{ik} = x_i - c_k
where the inputs are :math:`X=\{x_1,...,x_N\}` and :math:`C=\{c_1,...,c_K\}`, and the output is :math:`R=\{r_{ik}\}`.
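For reference (an editor's sketch, not part of the original notes), the same operation can be written in a few lines of plain PyTorch using ``expand``-based broadcasting, which is handy for checking the CUDA kernel against a known-good result::

    def residual_reference(X, C):
        # X: (B, N, D) input tensor, C: (K, D) codewords -> R: (B, N, K, D)
        B, N, D = X.size()
        K = C.size(0)
        # expand X over the codeword axis and C over the batch/feature axes
        return X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)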
- Add the CUDA kernel function and expose a C API in the generic file ``encoding/kernel/generic/encoding_kernel.c``, using Torch generic files::
__global__ void Encoding_(Residual_Forward_kernel) (
THCDeviceTensor<real, 4> R,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> D)
/*
* residual forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = R.getSize(2);
/* boundary check for output */
if (d >= X.getSize(2) || i >= X.getSize(1)) return;
/* main operation */
for(k=0; k<K; k++) {
R[b][i][k][d] = X[b][i][d].ldg() - D[k][d].ldg();
}
}
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_)
/*
* residual forward
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, R_, X_, D_);
if (THCTensor_(nDimension)(state, R_) != 4 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, D_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> D = devicetensor<2>(state, D_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(X.getSize(2)/16+1, X.getSize(1)/16+1,
X.getSize(0));
Encoding_(Residual_Forward_kernel)<<<blocks, threads, 0, stream>>>(R, X, D);
THCudaCheck(cudaGetLastError());
}
- Add the corresponding function header to ``encoding/kernel/generic/encoding_kernel.h``::
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_);
- Add a CFFI function to ``encoding/src/generic/encoding_generic.c``, which calls the C API we just wrote::
int Encoding_(residual_forward)(THCTensor *R, THCTensor *X, THCTensor *D)
/*
* Residual operation
*/
{
Encoding_(Residual_Forward)(state, R, X, D);
    /* the C function returns the number of outputs */
return 0;
}
- Add the corresponding function header to ``encoding/src/encoding_lib.h``::
int Encoding_Float_residual_forward(THCudaTensor *R, THCudaTensor *X,
THCudaTensor *D);
- Finally, call this function from Python::
class residual(Function):
def forward(self, X, C):
        # X \in (BxNxD), C \in (KxD) => R \in (BxNxKxD)
B, N, D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
R = X.new(B,N,K,D)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_residual_forward(R, X, C)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_residual_forward(R, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return R
- Note that this is just an example; you also need to implement the backward function for the ``residual`` operation (see the sketch below).
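A plain-PyTorch sketch of such a backward pass (an editor's illustration derived directly from :math:`r_{ik} = x_i - c_k`, not the package's CUDA implementation)::

    # assumes: from torch.autograd import Function
    class residual_ref(Function):
        def forward(self, X, C):
            # X \in (BxNxD), C \in (KxD) => R \in (BxNxKxD)
            B, N, D = X.size()
            K = C.size(0)
            return X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)

        def backward(self, gradR):
            B, N, K, D = gradR.size()
            # dL/dX[b,i,d] = sum over k of dL/dR[b,i,k,d]
            gradX = gradR.sum(2).contiguous().view(B, N, D)
            # dL/dC[k,d] = -sum over b and i of dL/dR[b,i,k,d]
            gradC = -gradR.contiguous().view(B * N, K, D).sum(0).view(K, D)
            return gradX, gradC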
.. role:: hidden
:class: hidden-section
Synchronized BatchNorm
======================
The current BN is implemented unsynchronized across the GPUs (each GPU normalizes with only the statistics of its own sub-batch), which is a big problem for memory-consuming tasks such as Semantic Segmentation, since the per-GPU mini-batch is very small.
Synchronizing the BatchNorm across multiple GPUs is not easy to implement within the current DataParallel framework. We address this difficulty by making each layer 'self-parallel', that is, accepting inputs from multiple GPUs, so that different layers can be handled separately when synchronizing them across GPUs.
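The mechanics can be summarized with a small sketch (an editor's illustration in plain PyTorch, not the package's actual multi-GPU code): each GPU first computes the per-channel sum and sum of squares of its own slice (which is what :class:`sum_square` below provides), the partial statistics are summed across devices, and the resulting global mean and standard deviation are then used for normalization on every device::

    def global_batchnorm_stats(xsums, xsqusums, counts, eps=1e-5):
        # xsums, xsqusums: lists of per-GPU per-channel sums and sums of squares
        # counts: number of elements behind each per-GPU statistic
        total = float(sum(counts))
        s = sum(xsums)       # elementwise sum over devices -> per-channel sum
        ss = sum(xsqusums)   # per-channel sum of squares
        mean = s / total
        var = ss / total - mean * mean   # E[x^2] - E[x]^2
        std = (var + eps).sqrt()
        return mean, std

The per-channel ``mean`` and ``std`` computed this way can then be fed to :class:`batchnormtrain` on each device, so that all devices normalize with the statistics of the whole mini-batch.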
.. currentmodule:: encoding
Functions
---------
:hidden:`batchnormtrain`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: batchnormtrain
:members:
:hidden:`batchnormeval`
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: batchnormeval
:members:
:hidden:`sum_square`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: sum_square
:members:
-from .aggregate import *
+from .aggregate import aggregate, scaledL2, aggregateP, residual, square_squeeze, assign
@@ -28,6 +28,14 @@ class aggregate(Function):
        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}` :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` (where :math:`B` is the batch size, :math:`N` is the total number of features, :math:`K` is the number of codewords, and :math:`D` is the feature dimension.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
Examples:
>>> B,N,K,D = 2,3,4,5
>>> A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5), requires_grad=True)
>>> X = Variable(torch.cuda.DoubleTensor(B,N,D).uniform_(-0.5,0.5), requires_grad=True)
>>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
>>> func = encoding.aggregate()
>>> E = func(A, X, C)
""" """
def forward(self, A, X, C): def forward(self, A, X, C):
# A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD) # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
...@@ -67,7 +75,18 @@ class aggregate(Function): ...@@ -67,7 +75,18 @@ class aggregate(Function):
return gradA, gradX, gradC return gradA, gradX, gradC
class ScaledL2(Function): class scaledL2(Function):
r"""
scaledL2 distance
.. math::
sl_{ik} = s_k \|x_i-c_k\|^2
Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` :math:`S\in \mathcal{R}^K` (where :math:`B` is the batch size, :math:`N` is the total number of features, :math:`K` is the number of codewords, and :math:`D` is the feature dimension.)
- Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
"""
    def forward(self, X, C, S):
        B,N,D = X.size()
        K = C.size(0)
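(Editor's note, not part of this commit: the scaled-L2 distance above reduces to a few broadcasted tensor operations, which is convenient as a reference implementation when testing the CUDA path.)

    # editor's sketch: reference for sl[b,i,k] = s[k] * ||x[b,i] - c[k]||^2
    def scaledL2_reference(X, C, S):
        B, N, D = X.size()
        K = C.size(0)
        R = X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)
        return S.view(1, 1, K).expand(B, N, K) * R.pow(2).sum(3).view(B, N, K)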
@@ -140,6 +159,17 @@ class aggregateP(Function):
class residual(Function):
r"""
Calculate residuals over a mini-batch
.. math::
r_{ik} = x_i - c_k
Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` (where :math:`B` is the batch size, :math:`N` is the total number of features, :math:`K` is the number of codewords, and :math:`D` is the feature dimension.)
- Output: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`
"""
    def forward(self, X, C):
        # X \in (BxNxD), C \in (KxD) => R \in (BxNxKxD)
        B, N, D = X.size()
...
@@ -48,7 +48,7 @@ class Encoding(nn.Module):
        >>> E = layer(X)
    Reference:
-        Zhang, Hang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
+        Hang Zhang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
    """
    def __init__(self, D, K):
        super(Encoding, self).__init__()
@@ -79,7 +79,7 @@ class Encoding(nn.Module):
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        # assignment weights
-        A = F.softmax(ScaledL2()(X, self.codewords, self.scale))
+        A = F.softmax(scaledL2()(X, self.codewords, self.scale))
        # aggregate
        E = aggregate()(A, X, self.codewords)
        return E
...
@@ -16,6 +16,9 @@ from torch.autograd import Function, Variable
from ._ext import encoding_lib
class sum_square(Function):
r"""
Calculate sum of elements and sum of squares for Batch Normalization.
"""
    def forward(ctx, input):
        ctx.save_for_backward(input)
        B,C,H,W = input.size()
@@ -53,6 +56,20 @@ class sum_square(Function):
class batchnormtrain(Function):
r"""Applies Batch Normalization over a 3d input that is seen as a
mini-batch.
    .. _encoding.batchnormtrain:
.. math::
y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
Shape:
- Input: :math:`(N, C)` or :math:`(N, C, L)`
- Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
"""
    def forward(ctx, input, gamma, beta, mean, std):
        ctx.save_for_backward(input, gamma, beta, mean, std)
        assert(input.dim()==3)
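(Editor's aside, not part of this commit: the formula above can be cross-checked with a plain per-channel computation; the sketch assumes ``std`` already incorporates the epsilon term, matching the ``forward(ctx, input, gamma, beta, mean, std)`` signature.)

    # editor's sketch: reference for y = (x - mean) / std * gamma + beta on a (B, C, L) input
    def batchnorm_reference(x, gamma, beta, mean, std):
        B, C, L = x.size()
        xn = (x - mean.view(1, C, 1).expand(B, C, L)) / std.view(1, C, 1).expand(B, C, L)
        return xn * gamma.view(1, C, 1).expand(B, C, L) + beta.view(1, C, 1).expand(B, C, L)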
@@ -99,6 +116,11 @@ class batchnormtrain(Function):
class batchnormeval(Function):
r"""Applies Batch Normalization over a 3d input that is seen as a
mini-batch.
Please see encoding.batchnormtrain_
"""
    def forward(ctx, input, gamma, beta, mean, std):
        ctx.save_for_backward(input, gamma, beta, mean, std)
        assert(input.dim()==3)
...
@@ -37,7 +37,7 @@ def test_aggregate():
    print('Testing aggregate(): {}'.format(test))
-def test_ScaledL2():
+def test_scaledL2():
    B,N,K,D = 2,3,4,5
    X = Variable(torch.cuda.DoubleTensor(B,N,D).uniform_(-0.5,0.5),
                 requires_grad=True)
@@ -46,8 +46,8 @@ def test_scaledL2():
    S = Variable(torch.cuda.DoubleTensor(K).uniform_(-0.5,0.5),
                 requires_grad=True)
    input = (X, C, S)
-    test = gradcheck(encoding.ScaledL2(), input, eps=1e-6, atol=1e-4)
-    print('Testing ScaledL2(): {}'.format(test))
+    test = gradcheck(encoding.scaledL2(), input, eps=1e-6, atol=1e-4)
+    print('Testing scaledL2(): {}'.format(test))
def test_assign():
@@ -63,7 +63,7 @@ def test_assign():
    A1 = encoding.assign(R, S)
    E1 = encoding.aggregateP()(A1, R)
-    A2 = F.softmax(encoding.ScaledL2()(X,C,S))
+    A2 = F.softmax(encoding.scaledL2()(X,C,S))
    E2 = encoding.aggregate()(A2, X, C)
    print('E1', E1)
@@ -121,7 +121,7 @@ def test_sum_square():
if __name__ == '__main__':
    test_aggregateP()
-    test_ScaledL2()
+    test_scaledL2()
    test_encoding()
    test_aggregate()
    test_residual()
...