##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib


class aggregate(Function):
    r"""
    Aggregate operation: aggregate the residuals of the inputs (:math:`X`)
    with respect to the codewords (:math:`C`), using the assignment
    weights (:math:`A`).

    .. math::
        e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - c_k)

    Shape:
        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}`,
          :math:`X\in\mathcal{R}^{B\times N\times D}`,
          :math:`C\in\mathcal{R}^{K\times D}`
          (where :math:`B` is the batch size, :math:`N` is the total number
          of features, :math:`K` is the number of codewords and :math:`D` is
          the feature dimension.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`

    Examples:
        >>> B,N,K,D = 2,3,4,5
        >>> A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5), requires_grad=True)
        >>> X = Variable(torch.cuda.DoubleTensor(B,N,D).uniform_(-0.5,0.5), requires_grad=True)
        >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
        >>> func = encoding.aggregate()
        >>> E = func(A, X, C)
    """
    def forward(self, A, X, C):
        # A \in (BxNxK), X \in (BxNxD), C \in (KxD) => E \in (BxKxD)
        self.save_for_backward(A, X, C)
        B, N, K = A.size()
        D = X.size(2)
        with torch.cuda.device_of(A):
            E = A.new(B, K, D)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E

    def backward(self, gradE):
        A, X, C = self.saved_tensors
        with torch.cuda.device_of(A):
            gradA = A.new().resize_as_(A)
            gradX = A.new().resize_as_(X)
            gradC = A.new().resize_as_(C)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregateE_backward(
                    gradA, gradE, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregateE_backward(
                    gradA, gradE, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        # gradX_{bnd} = sum_k A_{bnk} * gradE_{bkd}
        gradX.copy_(torch.bmm(A, gradE))
        # gradC_{kd} = -sum_b gradE_{bkd} * (sum_i A_{bik})
        gradC.copy_((-gradE * A.sum(1).unsqueeze(2)).sum(0))
        return gradA, gradX, gradC
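
# --------------------------------------------------------------------------
# Editorial note: the helper below is an illustrative sketch, not part of the
# original extension. It recomputes `aggregate` with plain PyTorch tensor ops
# (assuming broadcasting semantics, PyTorch >= 0.2), which is convenient for
# sanity-checking the CUDA kernel on small inputs.
# --------------------------------------------------------------------------
def _aggregate_reference(A, X, C):
    # E_{bkd} = sum_i A_{bik} * (X_{bid} - C_{kd})
    #         = (A^T X)_{bkd} - (sum_i A_{bik}) * C_{kd}
    return torch.bmm(A.transpose(1, 2), X) \
        - A.sum(1).unsqueeze(2) * C.unsqueeze(0)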
class scaledL2(Function):
    r"""
    scaledL2 distance

    .. math::
        sl_{ik} = s_k \|x_i - c_k\|^2

    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}`,
          :math:`C\in\mathcal{R}^{K\times D}`,
          :math:`S\in\mathcal{R}^K`
          (where :math:`B` is the batch size, :math:`N` is the total number
          of features, :math:`K` is the number of codewords and :math:`D` is
          the feature dimension.)
        - Output: :math:`SL\in\mathcal{R}^{B\times N\times K}`
    """
    def forward(self, X, C, S):
        B, N, D = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
            SL = X.new(B, N, K)
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
        self.save_for_backward(X, C, S, SL)
        return SL

    def backward(self, gradSL):
        X, C, S, SL = self.saved_tensors
        K = C.size(0)
        with torch.cuda.device_of(X):
            gradX = X.new().resize_as_(X)
            gradC = X.new().resize_as_(C)
            gradS = X.new().resize_as_(S)
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_backward(
                    gradSL, gradX, gradC, X, C, S)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_scaledl2_backward(
                    gradSL, gradX, gradC, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
        # since sl_{ik} = s_k * ||x_i - c_k||^2, dsl_{ik}/ds_k = sl_{ik} / s_k
        gradS.copy_((gradSL * (SL / S.view(1, 1, K))).sum(0).sum(0))
        return gradX, gradC, gradS


class aggregateP(Function):
    def forward(self, A, R):
        # A \in (BxNxK), R \in (BxNxKxD) => E \in (BxKxD)
        self.save_for_backward(A, R)
        B, N, K, D = R.size()
        with torch.cuda.device_of(A):
            E = A.new(B, K, D)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E

    def backward(self, gradE):
        A, R = self.saved_tensors
        with torch.cuda.device_of(A):
            gradA = A.new().resize_as_(A)
            gradR = R.new().resize_as_(R)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_backward(
                    gradA, gradR, gradE, A, R)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregate_backward(
                    gradA, gradR, gradE, A, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradA, gradR
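
# --------------------------------------------------------------------------
# Editorial note: illustrative sketch, not part of the original extension.
# A plain-PyTorch equivalent of `scaledL2` (sl_{ik} = s_k * ||x_i - c_k||^2),
# computed by broadcasting X as (B,N,1,D) against C as (1,1,K,D); useful for
# checking the CUDA kernel on small inputs.
# --------------------------------------------------------------------------
def _scaledl2_reference(X, C, S):
    residuals = X.unsqueeze(2) - C.view(1, 1, *C.size())  # (B,N,K,D)
    return S.view(1, 1, -1) * residuals.pow(2).sum(3)     # (B,N,K)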
class residual(Function):
    r"""
    Calculate residuals over a mini-batch

    .. math::
        r_{ik} = x_i - c_k

    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}`,
          :math:`C\in\mathcal{R}^{K\times D}`
          (where :math:`B` is the batch size, :math:`N` is the total number
          of features, :math:`K` is the number of codewords and :math:`D` is
          the feature dimension.)
        - Output: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`
    """
    def forward(self, X, C):
        # X \in (BxNxD), C \in (KxD) => R \in (BxNxKxD)
        B, N, D = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
            R = X.new(B, N, K, D)
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_residual_forward(R, X, C)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_residual_forward(R, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return R

    def backward(self, gradR):
        B, N, K, D = gradR.size()
        with torch.cuda.device_of(gradR):
            gradX = gradR.new(B, N, D)
            gradC = gradR.new(K, D)
        if isinstance(gradR, torch.cuda.FloatTensor):
            with torch.cuda.device_of(gradR):
                encoding_lib.Encoding_Float_residual_backward(
                    gradR, gradX, gradC)
        elif isinstance(gradR, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(gradR):
                encoding_lib.Encoding_Double_residual_backward(
                    gradR, gradX, gradC)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradX, gradC


class square_squeeze(Function):
    def forward(self, R):
        # L_{ik} = ||r_{ik}||^2: square R and sum out the feature dimension D
        B, N, K, D = R.size()
        with torch.cuda.device_of(R):
            L = R.new(B, N, K)
        if isinstance(R, torch.cuda.FloatTensor):
            with torch.cuda.device_of(R):
                encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
        elif isinstance(R, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(R):
                encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        self.save_for_backward(L, R)
        return L

    def backward(self, gradL):
        L, R = self.saved_tensors
        B, N, K, D = R.size()
        with torch.cuda.device_of(R):
            gradR = R.new(B, N, K, D)
        if isinstance(R, torch.cuda.FloatTensor):
            with torch.cuda.device_of(gradL):
                encoding_lib.Encoding_Float_squaresqueeze_backward(
                    gradL, gradR, R)
        elif isinstance(R, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(gradL):
                encoding_lib.Encoding_Double_squaresqueeze_backward(
                    gradL, gradR, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradR


def assign(R, S):
    r"""
    Calculate assignment weights for given residuals (:math:`R`) and
    scale (:math:`S`)

    .. math::
        a_{ik} = \frac{\exp(-s_k\|r_{ik}\|^2)}{\sum_{j=1}^K \exp(-s_j\|r_{ij}\|^2)}

    Shape:
        - Input: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`,
          :math:`S\in\mathcal{R}^K`
          (where :math:`B` is the batch size, :math:`N` is the total number
          of features, :math:`K` is the number of codewords and :math:`D` is
          the feature dimension.)
        - Output: :math:`A\in\mathcal{R}^{B\times N\times K}`
    """
    L = square_squeeze()(R)
    K = S.size(0)
    # NB: softmax(s_k * L) matches the formula above when the caller passes a
    # negative scale; the sign is folded into S
    SL = L * S.view(1, 1, K)
    # normalize over the K codewords
    return F.softmax(SL, dim=2)
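
# --------------------------------------------------------------------------
# Editorial note: illustrative sketches, not part of the original extension.
# Plain-PyTorch equivalents of `residual` and `assign` for small-input
# sanity checks, e.g. comparing against the CUDA path on double-precision
# inputs before running torch.autograd.gradcheck.
# --------------------------------------------------------------------------
def _residual_reference(X, C):
    # R_{bikd} = X_{bid} - C_{kd}, broadcast over the K axis
    return X.unsqueeze(2) - C.view(1, 1, *C.size())


def _assign_reference(R, S):
    # a_{ik} = softmax_k(s_k * ||r_{ik}||^2), mirroring `assign` above
    SL = R.pow(2).sum(3) * S.view(1, 1, S.size(0))
    return F.softmax(SL, dim=2)

# Example comparison (assumes a CUDA build of encoding_lib):
#   >>> B, N, K, D = 2, 3, 4, 5
#   >>> X = Variable(torch.cuda.DoubleTensor(B, N, D).uniform_(-0.5, 0.5))
#   >>> C = Variable(torch.cuda.DoubleTensor(K, D).uniform_(-0.5, 0.5))
#   >>> diff = (residual()(X, C) - _residual_reference(X, C)).abs().max()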