Commit 1177a80b authored by Hang Zhang's avatar Hang Zhang

memory efficient implementation and test script

parent 8dd870b1
......@@ -28,9 +28,11 @@ else:
os.environ['THC_LIBRARIES'] = os.path.join(lib_path,'libTHC.so.1')
ENCODING_LIB = os.path.join(lib_path, 'libENCODING.so')
clean_cmd = ['bash', 'clean.sh']
subprocess.check_call(clean_cmd)
build_all_cmd = ['bash', 'encoding/make.sh']
if subprocess.call(build_all_cmd, env=dict(os.environ)) != 0:
sys.exit(1)
subprocess.check_call(build_all_cmd, env=dict(os.environ))
sources = ['encoding/src/encoding_lib.cpp']
headers = ['encoding/src/encoding_lib.h']
......
......@@ -10,55 +10,253 @@
import threading
import torch
import torch.cuda.nccl as nccl
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from torch.nn.parameter import Parameter
from ._ext import encoding_lib
class aggregateE(Function):
def forward(self, A, X, C):
# A \in(BxNxK) X \in(BxNxD) C \in(KxD) => E \in(BxKxD)
self.save_for_backward(A, X, C)
B, N, K = A.size()
D = X.size(2)
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, X, C = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradX = A.new().resize_as_(X)
gradC = A.new().resize_as_(C)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_backward(gradA,
gradE, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_backward(gradA,
gradE, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
gradX.copy_(torch.bmm(A, gradE))
gradC.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0))
return gradA, gradX, gradC
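# For reference, the aggregateE forward pass can be reproduced with plain tensor
# ops (an illustrative sketch, not part of this commit; it materializes the full
# BxNxKxD residual tensor, which is exactly the memory cost the fused CUDA
# kernel above avoids):
#
# def aggregateE_reference(A, X, C):
#     # A: BxNxK assignments, X: BxNxD features, C: KxD codewords
#     B, N, K = A.size()
#     D = X.size(2)
#     # R[b,i,k,d] = X[b,i,d] - C[k,d]  (BxNxKxD, memory heavy)
#     R = X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)
#     # E[b,k,d] = sum_i A[b,i,k] * R[b,i,k,d]
#     return (A.unsqueeze(3) * R).sum(1)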
class ScaledL2(Function):
def forward(self, X, C, S):
B,N,D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
SL = X.new(B,N,K)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(X, C, S, SL)
return SL
def backward(self, gradSL):
X, C, S, SL = self.saved_tensors
K = C.size(0)
with torch.cuda.device_of(X):
gradX = X.new().resize_as_(X)
gradC = X.new().resize_as_(C)
gradS = X.new().resize_as_(S)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
gradS.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0))
return gradX, gradC, gradS
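# The ScaledL2 forward pass has an equally simple tensor-op reference (a sketch
# using the imports at the top of this file, not part of the commit):
#
# def scaled_l2_reference(X, C, S):
#     # X: BxNxD, C: KxD codewords, S: K smoothing factors
#     B, N, D = X.size()
#     K = C.size(0)
#     R = X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)
#     # SL[b,i,k] = S[k] * ||X[b,i] - C[k]||^2
#     return S.view(1, 1, K) * R.pow(2).sum(3)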
class Encoding(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)
std2 = 1./((self.K)**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2).contiguous()
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2).contiguous()
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# assignment weights
SL = ScaledL2()(X, self.codewords, self.scale)
# softmax over the K codewords (flattened to 2D for the 2D softmax API)
A = F.softmax(SL.view(B*N, K)).view(B, N, K)
# aggregate
E = aggregateE()(A, X, self.codewords)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
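# A minimal usage sketch for the layer (hypothetical, not part of the commit;
# it assumes the CUDA extension is built, since only GPU tensors are handled):
#
# layer = Encoding(D=16, K=32).cuda()
# X = Variable(torch.randn(2, 16, 8, 8).cuda(), requires_grad=True)
# E = layer(X)          # shape 2 x 32 x 16, i.e. B x K x D
# E.sum().backward()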
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class aggregate(Function):
def forward(self, A, R):
# A \in(BxNxK) R \in(BxNxKxD) => E \in(BxKxD)
self.save_for_backward(A, R)
B, N, K, D = R.size()
with torch.cuda.device_of(A):
E = A.new(B,K,D)
# TODO support cpu backend
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, R = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradR = R.new().resize_as_(R)
if isinstance(A, torch.cuda.FloatTensor):
encoding_lib.Encoding_Float_aggregate_backward(gradA, gradR, gradE,
A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_backward(gradA,
gradR, gradE, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
encoding_lib.Encoding_Double_aggregate_backward(gradA, gradR, gradE,
A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_backward(gradA,
gradR, gradE, A, R)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradA, gradR
class residual(Function):
def forward(self, X, C):
# X \in(BxNxD) C \in(KxD) => R \in(BxNxKxD)
B, N, D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
R = X.new(B,N,K,D)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_residual_forward(R, X, C)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_residual_forward(R, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return R
def backward(self, gradR):
B, N, K, D = gradR.size()
with torch.cuda.device_of(gradR):
gradX = gradR.new(B,N,D)
gradD = gradR.new(K,D)
if isinstance(gradR, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Float_residual_backward(gradR,
gradX, gradD)
elif isinstance(gradR, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Double_residual_backward(gradR,
gradX, gradD)
else:
raise RuntimeError('Unimplemented data type!')
return gradX, gradD
class square_squeeze(Function):
def forward(self, R):
B, N, K, D = R.size()
with torch.cuda.device_of(R):
L = R.new(B,N,K)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(L, R)
return L
def backward(self, gradL):
L, R = self.saved_tensors
B, N, K, D = R.size()
with torch.cuda.device_of(R):
gradR = R.new(B,N,K,D)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
gradR, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
gradR, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradR
def assign(R, S):
L = square_squeeze()(R)
K = S.size(0)
SL = L * S.view(1,1,K)
B, N, _ = SL.size()
# softmax over the K codewords (flattened to 2D for the 2D softmax API)
return F.softmax(SL.view(B*N, K)).view(B, N, K)
class Aggregate(nn.Module):
def forward(self, A, R):
return aggregate()(A, R)
class Encoding(nn.Module):
class EncodingP(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
super(EncodingP, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D), requires_grad=True)
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.softmax = nn.Softmax()
self.reset_params()
def reset_params(self):
......@@ -69,34 +267,33 @@ class Encoding(nn.Module):
def forward(self, X):
# input X is a 4D tensor
assert(X.size(1)==self.D,"Encoding Layer incompatible input channels!")
unpacked = False
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
unpacked = True
X = X.unsqueeze(0)
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2)
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
# reshape input
X = X.view(B,D,-1).transpose(1,2)
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# calculate residuals
R = X.contiguous().view(B,N,1,D).expand(B,N,K,D) - self.codewords.view(
1,1,K,D).expand(B,N,K,D)
R = residual()(X.contiguous(), self.codewords)
# assignment weights
A = R
A = A.pow(2).sum(3).view(B,N,K)
A = A*self.scale.view(1,1,K).expand_as(A)
A = self.softmax(A.view(B*N,K)).view(B,N,K)
A = assign(R, self.scale)
# aggregate
E = aggregate()(A, R)
if unpacked:
E = E.squeeze(0)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' + str(self.D) + ')'
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
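# The commit message mentions a test script; a gradient check along the
# following lines is one way to exercise the new CUDA functions (a hedged
# sketch, the actual test/test.py may differ; double precision is used because
# gradcheck needs tight tolerances):
#
# from torch.autograd import gradcheck
#
# A = Variable(torch.randn(2, 5, 3).double().cuda(), requires_grad=True)
# X = Variable(torch.randn(2, 5, 4).double().cuda(), requires_grad=True)
# C = Variable(torch.randn(3, 4).double().cuda(), requires_grad=True)
# S = Variable(torch.randn(3).double().cuda(), requires_grad=True)
# # a fresh Function instance per call, as old-style Functions must not be reused
# print(gradcheck(lambda x, c, s: ScaledL2()(x, c, s), (X, C, S)))
# print(gradcheck(lambda a, x, c: aggregateE()(a, x, c), (A, X, C)))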
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class sum_square(Function):
def forward(ctx, input):
ctx.save_for_backward(input)
......@@ -113,7 +310,7 @@ class sum_square(Function):
encoding_lib.Encoding_Double_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return xsum, xsquare
def backward(ctx, gradSum, gradSquare):
......@@ -121,8 +318,6 @@ class sum_square(Function):
B,C,H,W = input.size()
with torch.cuda.device_of(input):
gradInput = input.new().resize_(B,C,H*W).zero_()
# gradSum.view(1,C,1,1).expand_as(input) + \
# 2*gradSquare.view(1,C,1,1).expand_as(input)*input
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Backward(
......@@ -132,9 +327,10 @@ class sum_square(Function):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput.view(B,C,H,W)
class batchnormtrain(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
......@@ -151,7 +347,7 @@ class batchnormtrain(Function):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
......@@ -177,9 +373,10 @@ class batchnormtrain(Function):
mean, invstd, gamma, beta, gradMean, gradStd,
True)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
class batchnormeval(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
......@@ -196,7 +393,7 @@ class batchnormeval(Function):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
......@@ -221,6 +418,6 @@ class batchnormeval(Function):
mean, invstd, gamma, beta, gradMean, gradStd,
False)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
......@@ -12,6 +12,62 @@
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(AggregateE_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, N;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
N = A.getSize(1);
/* boundary check for output */
sum = 0;
if (d >= E.getSize(2) || k >= E.getSize(1)) return;
/* main operation */
for(i=0; i<N; i++) {
sum += A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
E[b][k][d] = sum;
}
void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregating forward the residuals with assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1,
E.getSize(0));
Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
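The kernel above computes E[b][k][d] = sum_i A[b][i][k] * (X[b][i][d] - C[k][d]); a host-side reference for checking it (a sketch, assuming a PyTorch build that provides torch.einsum):

import torch

def aggregateE_forward_reference(A, X, C):
    # E[b,k,d] = sum_i A[b,i,k] * (X[b,i,d] - C[k,d])
    return torch.einsum('bik,bid->bkd', A, X) - torch.einsum('bik,kd->bkd', A, C)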
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
......@@ -63,11 +119,72 @@ void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(AggregateE_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* GA (dl/dA), GE (dl/dE), A (assignments)
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real sum;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y * blockDim.y + threadIdx.y;
k = blockIdx.x * blockDim.x + threadIdx.x;
D = GE.getSize(2);
/* boundary check for output G \in R^{BxNxKxD} */
if (k >= GA.getSize(2) || i >= GA.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
sum += GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
GA[b][i][k] = sum;
}
void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* GA (dl/dA), GE (dl/dE)
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1,
GA.getSize(0));
Encoding_(AggregateE_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
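The backward kernel reduces over the channel dimension: GA[b][i][k] = sum_d GE[b][k][d] * (X[b][i][d] - C[k][d]); a host-side reference sketch (again assuming torch.einsum is available):

import torch

def aggregateE_backward_gradA_reference(GE, X, C):
    # GA[b,i,k] = sum_d GE[b,k,d] * (X[b,i,d] - C[k,d])
    return (torch.einsum('bkd,bid->bik', GE, X)
            - torch.einsum('bkd,kd->bk', GE, C).unsqueeze(1))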
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 3> L,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 4> R)
/*
......@@ -82,38 +199,37 @@ __global__ void Encoding_(Aggregate_Backward_kernel) (
b = blockIdx.z;
i = blockIdx.y * blockDim.y + threadIdx.y;
k = blockIdx.x * blockDim.x + threadIdx.x;
D = L.getSize(2);
D = GE.getSize(2);
/* boundary check for output G \in R^{BxNxKxD} */
if (k >= GR.getSize(2) || i >= GR.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
//sum += L[b][k][d].ldg() * R[b][i][k][d].ldg();
GR[b][i][k][d] = L[b][k][d].ldg() * A[b][i][k].ldg();
sum += L[b][k][d].ldg() * R[b][i][k][d].ldg();
GR[b][i][k][d] = GE[b][k][d].ldg() * A[b][i][k].ldg();
sum += GE[b][k][d].ldg() * R[b][i][k][d].ldg();
}
GA[b][i][k] = sum;
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GR_, THCTensor *L_, THCTensor *A_, THCTensor *R_)
THCTensor *GR_, THCTensor *GE_, THCTensor *A_, THCTensor *R_)
/*
* aggregate backward to assignment weights
* GR (dl/dR), GE (dl/dE), A (assignments)
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GR_, L_, A_, R_);
THCTensor_(checkGPU)(state, 5, GA_, GR_, GE_, A_, R_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, L_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
......@@ -122,14 +238,397 @@ void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1,
GA.getSize(0));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>(GA,
GR, L, A, R);
GR, GE, A, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Residual_Forward_kernel) (
THCDeviceTensor<real, 4> R,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> D)
/*
* residual forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = R.getSize(2);
/* boundary check for output */
if (d >= X.getSize(2) || i >= X.getSize(1)) return;
/* main operation */
for(k=0; k<K; k++) {
R[b][i][k][d] = X[b][i][d].ldg() - D[k][d].ldg();
}
}
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_)
/*
* computes residuals R[b][i][k][d] = X[b][i][d] - D[k][d]
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, R_, X_, D_);
if (THCTensor_(nDimension)(state, R_) != 4 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, D_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> D = devicetensor<2>(state, D_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(X.getSize(2)/16+1, X.getSize(1)/16+1,
X.getSize(0));
Encoding_(Residual_Forward_kernel)<<<blocks, threads, 0, stream>>>(R, X, D);
THCudaCheck(cudaGetLastError());
}
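The residual kernel is a broadcast subtraction; the equivalent in plain PyTorch (a reference sketch, where the KxD codeword matrix named D above is passed as C):

import torch

def residual_forward_reference(X, C):
    # R[b,i,k,d] = X[b,i,d] - C[k,d]
    return X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)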
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ResidualX_Backward_kernel) (
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 3> GX)
/*
* residual backward kernel function (gradient w.r.t. X)
*/
{
/* declarations of the variables */
int b, k, d, i, K;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = GR.getSize(2);
/* boundary check for output */
if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
/* main operation */
sum = 0;
for(k=0; k<K; k++) {
sum += GR[b][i][k][d].ldg();
}
GX[b][i][d] = sum;
}
__global__ void Encoding_(ResidualD_Backward_kernel) (
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 2> GD)
/*
* residual backward kernel function (gradient w.r.t. the codewords)
*/
{
/* declarations of the variables */
int b, k, d, i, B, N;
real sum;
/* Get the index and channels */
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
B = GR.getSize(0);
N = GR.getSize(1);
/* boundary check for output */
if (d >= GD.getSize(1) || k >= GD.getSize(0)) return;
/* main operation */
sum = 0;
for(b=0; b<B; b++) {
for(i=0; i<N; i++) {
sum -= GR[b][i][k][d].ldg();
}
}
GD[k][d] = sum;
}
void Encoding_(Residual_Backward)(
THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_)
/*
* residual backward: reduces GR into GX and GD
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, GR_, GX_, GD_);
if (THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GD_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GD = devicetensor<2>(state, GD_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1,
GX.getSize(0));
Encoding_(ResidualX_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GR, GX);
THCudaCheck(cudaGetLastError());
dim3 blocks2(GD.getSize(1)/16+1, GD.getSize(0)/16+1);
Encoding_(ResidualD_Backward_kernel)<<<blocks2, threads, 0, stream>>>
(GR, GD);
THCudaCheck(cudaGetLastError());
}
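Both residual backward kernels are plain reductions over GR; a reference sketch of the two gradients:

import torch

def residual_backward_reference(GR):
    # gradX[b,i,d] = sum_k GR[b,i,k,d];  gradD[k,d] = -sum_{b,i} GR[b,i,k,d]
    return GR.sum(2), -GR.sum(0).sum(0)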
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real r, sum;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = X.getSize(2);
/* boundary check for output */
if (k >= SL.getSize(2) || i >= SL.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
r = X[b][i][d].ldg() - C[k][d].ldg();
sum += r * r;
}
SL[b][i][k] = S[k] * sum;
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* computes SL[b][i][k] = S[k] * ||X[b][i] - C[k]||^2
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(SL.getSize(2)/16+1, SL.getSize(1)/16+1,
SL.getSize(0));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, k, d, i, K;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = C.getSize(0);
/* boundary check for output */
if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
/* main operation */
sum = 0;
for(k=0; k<K; k++) {
sum += 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
GX[b][i][d] = sum;
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, k, d, i, B, N;
real sum;
/* Get the index and channels */
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* boundary check for output */
if (d >= GC.getSize(1) || k >= GC.getSize(0)) return;
/* main operation */
sum = 0;
for(b=0; b<B; b++) {
for(i=0; i<N; i++) {
sum += -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
}
GC[k][d] = sum;
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1,
GX.getSize(0));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(GC.getSize(1)/16+1, GC.getSize(0)/16+1);
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
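The two scaled-L2 backward kernels implement GX[b][i][d] = sum_k 2*S[k]*GSL[b][i][k]*(X[b][i][d]-C[k][d]) and GC[k][d] = -sum_{b,i} 2*S[k]*GSL[b][i][k]*(X[b][i][d]-C[k][d]); a host-side reference sketch (assuming torch.einsum):

import torch

def scaled_l2_backward_reference(GSL, X, C, S):
    W = 2 * S.view(1, 1, -1) * GSL                                 # BxNxK
    GX = X * W.sum(2, keepdim=True) - torch.einsum('bik,kd->bid', W, C)
    GC = C * W.sum(0).sum(0).unsqueeze(1) - torch.einsum('bik,bid->kd', W, X)
    return GX, GC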
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
THCDeviceTensor<real, 3> L,
THCDeviceTensor<real, 4> R)
/*
* square-squeeze forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real sum;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = R.getSize(3);
/* boundary check for output */
if (k >= L.getSize(2) || i >= L.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
sum += R[b][i][k][d].ldg()*R[b][i][k][d].ldg();
}
L[b][i][k] = sum;
}
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_)
/*
* computes L[b][i][k] = sum_d R[b][i][k][d]^2
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, L_, R_);
if (THCTensor_(nDimension)(state, L_) != 3 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1,
L.getSize(0));
Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
(L, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
THCDeviceTensor<real, 3> GL,
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 4> R)
/*
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real scale;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = R.getSize(3);
/* boundary check for output */
if (k >= R.getSize(2) || i >= R.getSize(1)) return;
/* main operation */
scale = GL[b][i][k] * 2;
for(d=0; d<D; d++) {
GR[b][i][k][d] = scale * R[b][i][k][d];
}
}
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, GL_, GR_, R_);
if (THCTensor_(nDimension)(state, GL_) != 3 ||
THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1,
R.getSize(0));
Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GL, GR, R);
THCudaCheck(cudaGetLastError());
}
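SquareSqueeze is an elementwise square followed by a sum over the channel dimension, so both directions have one-line references (a sketch, not part of the commit):

import torch

def square_squeeze_reference(R, GL=None):
    L = R.pow(2).sum(3)               # L[b,i,k] = sum_d R[b,i,k,d]^2
    if GL is None:
        return L
    GR = 2 * GL.unsqueeze(3) * R      # GR[b,i,k,d] = 2 * GL[b,i,k] * R[b,i,k,d]
    return L, GR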
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
......@@ -140,7 +639,6 @@ __global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
//int N = input.getSize(0) * input.getSize(2);
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
......@@ -291,10 +789,6 @@ __global__ void Encoding_(BatchNorm_Backward_kernel) (
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
//int N = input.getSize(0) * input.getSize(2);
//real norm;
//norm = 1.0 / N;
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
......
......@@ -12,11 +12,37 @@
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(Aggregate_Forward)(
THCState *state, THCTensor *E_, THCTensor *A_, THCTensor *R_);
void Encoding_(Aggregate_Backward)(
THCState *state, THCTensor *GA_, THCTensor *GR_, THCTensor *L_,
THCTensor *A_, THCTensor *R_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GR_, THCTensor *L_, THCTensor *A_, THCTensor *R_);
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_);
void Encoding_(Residual_Backward)(
THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_);
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
......
......@@ -20,11 +20,36 @@
#include "THC/THCGenerateFloatType.h"
*/
int Encoding_Float_scaledl2_forward(THCudaTensor *SL,
THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
int Encoding_Float_scaledl2_backward(
THCudaTensor *GSL, THCudaTensor *GX, THCudaTensor *GC,
THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
int Encoding_Float_aggregateE_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *X, THCudaTensor *C);
int Encoding_Float_aggregateE_backward(THCudaTensor *GA, THCudaTensor *GE,
THCudaTensor *A, THCudaTensor *X, THCudaTensor *C);
int Encoding_Float_aggregate_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *R);
int Encoding_Float_aggregate_backward(THCudaTensor *GA, THCudaTensor *GR,
THCudaTensor *L, THCudaTensor *A, THCudaTensor *R);
int Encoding_Float_residual_forward(THCudaTensor *R, THCudaTensor *X,
THCudaTensor *D);
int Encoding_Float_residual_backward(THCudaTensor *GR, THCudaTensor *GX,
THCudaTensor *GD);
int Encoding_Float_squaresqueeze_forward(THCudaTensor *L, THCudaTensor *R);
int Encoding_Float_squaresqueeze_backward(THCudaTensor *GL,
THCudaTensor *GR, THCudaTensor *R);
int Encoding_Float_batchnorm_Forward(THCudaTensor *output_,
THCudaTensor *input_, THCudaTensor *mean_,
THCudaTensor *invstd_, THCudaTensor *gamma_, THCudaTensor *beta_);
......@@ -43,25 +68,55 @@ void Encoding_Float_sum_square_Backward(
THCudaTensor *gradInput, THCudaTensor *input_,
THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_Double_aggregate_forward(THCudaDoubleTensor *E,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_scaledl2_forward(THCudaDoubleTensor *SL,
THCudaDoubleTensor *X, THCudaDoubleTensor *C, THCudaDoubleTensor *S);
int Encoding_Double_scaledl2_backward(
THCudaDoubleTensor *GSL, THCudaDoubleTensor *GX,
THCudaDoubleTensor *GC, THCudaDoubleTensor *X,
THCudaDoubleTensor *C, THCudaDoubleTensor *S);
int Encoding_Double_aggregateE_forward(THCudaDoubleTensor *E,
THCudaDoubleTensor *A, THCudaDoubleTensor *X, THCudaDoubleTensor *C);
int Encoding_Double_aggregateE_backward(THCudaDoubleTensor *GA,
THCudaDoubleTensor *GE, THCudaDoubleTensor *A, THCudaDoubleTensor *X,
THCudaDoubleTensor *C);
int Encoding_Double_aggregate_forward(
THCudaDoubleTensor *E, THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_aggregate_backward(THCudaDoubleTensor *GA,
THCudaDoubleTensor *GR, THCudaDoubleTensor *L,
int Encoding_Double_aggregate_backward(
THCudaDoubleTensor *GA, THCudaDoubleTensor *GR, THCudaDoubleTensor *L,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_residual_forward(
THCudaDoubleTensor *R, THCudaDoubleTensor *X, THCudaDoubleTensor *D);
int Encoding_Double_residual_backward(
THCudaDoubleTensor *GR, THCudaDoubleTensor *GX,
THCudaDoubleTensor *GD);
int Encoding_Double_squaresqueeze_forward(THCudaDoubleTensor *L,
THCudaDoubleTensor *R);
int Encoding_Double_squaresqueeze_backward(THCudaDoubleTensor *GL,
THCudaDoubleTensor *GR, THCudaDoubleTensor *R);
int Encoding_Double_batchnorm_Forward(THCudaDoubleTensor *output_,
THCudaDoubleTensor *input_, THCudaDoubleTensor *mean_,
THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_);
THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_,
THCudaDoubleTensor *beta_);
int Encoding_Double_batchnorm_Backward(THCudaDoubleTensor *gradoutput_,
THCudaDoubleTensor *input_, THCudaDoubleTensor *gradinput_,
THCudaDoubleTensor *gradgamma_, THCudaDoubleTensor *gradbeta_,
THCudaDoubleTensor *mean_, THCudaDoubleTensor *invstd_,
THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_,
THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_, int train);
THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_,
int train);
int Encoding_Double_sum_square_Forward(THCudaDoubleTensor *input_,
THCudaDoubleTensor *sum_, THCudaDoubleTensor *square_);
......
......@@ -12,6 +12,56 @@
#define THC_GENERIC_FILE "generic/encoding_generic.c"
#else
int Encoding_(scaledl2_forward)(THCTensor *SL,
THCTensor *X, THCTensor *C, THCTensor *S)
/*
* ScaledL2 operation
*/
{
Encoding_(ScaledL2_Forward)(state, SL, X, C, S);
/* C function return number of the outputs */
return 0;
}
int Encoding_(scaledl2_backward)(
THCTensor *GSL, THCTensor *GX, THCTensor *GC,
THCTensor *X, THCTensor *C, THCTensor *S)
/*
* ScaledL2 operation
*/
{
Encoding_(ScaledL2_Backward)(state, GSL, GX, GC, X, C, S);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregateE_forward)(THCTensor *E, THCTensor *A,
THCTensor *X, THCTensor *C)
/*
* Aggregate operation
*/
{
Encoding_(AggregateE_Forward)(state, E, A, X, C);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregateE_backward)(THCTensor *GA, THCTensor *GE,
THCTensor *A, THCTensor *X, THCTensor *C)
/*
* Aggregate backward operation to A
* GA (dl/dA), GE (dl/dE), A (assignments)
*/
{
Encoding_(AggregateE_Backward)(state, GA, GE, A, X, C);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregate_forward)(THCTensor *E, THCTensor *A,
THCTensor *R)
/*
......@@ -35,6 +85,49 @@ int Encoding_(aggregate_backward)(THCTensor *GA, THCTensor *GR,
return 0;
}
int Encoding_(residual_forward)(THCTensor *R, THCTensor *X, THCTensor *D)
/*
* Residual operation
*/
{
Encoding_(Residual_Forward)(state, R, X, D);
/* C function return number of the outputs */
return 0;
}
int Encoding_(residual_backward)(THCTensor *GR, THCTensor *GX,
THCTensor *GD)
/*
* Residual operation
*/
{
Encoding_(Residual_Backward)(state, GR, GX, GD);
/* C function return number of the outputs */
return 0;
}
int Encoding_(squaresqueeze_forward)(THCTensor *L, THCTensor *R)
/*
* SquareSqueeze operation
*/
{
Encoding_(SquareSqueeze_Forward)(state, L, R);
/* C function return number of the outputs */
return 0;
}
int Encoding_(squaresqueeze_backward)(THCTensor *GL, THCTensor *GR,
THCTensor *R)
/*
* SquareSqueeze operation
*/
{
Encoding_(SquareSqueeze_Backward)(state, GL, GR, R);
/* C function return number of the outputs */
return 0;
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
......
......@@ -10,19 +10,26 @@
import os
import sys
import subprocess
from setuptools import setup, find_packages
import build
from setuptools.command.develop import develop
from setuptools.command.install import install
this_file = os.path.dirname(__file__)
extra_compile_args = ['-std=c++11', '-Wno-write-strings']
#extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
extra_compile_args += ['-static-libstdc++']
extra_link_args += ['-static-libstdc++']
class TestCommand(install):
"""Post-installation mode."""
def run(self):
install.run(self)
subprocess.check_call("python test/test.py".split())
setup(
name="encoding",
version="0.0.1",
......@@ -35,11 +42,14 @@ setup(
setup_requires=["cffi>=1.0.0"],
# Exclude the build files.
packages=find_packages(exclude=["build"]),
extra_compile_args=extra_compile_args,
#extra_compile_args=extra_compile_args,
# Package where to put the extensions. Has to be a prefix of build.py.
ext_package="",
# Extensions to compile.
cffi_modules=[
os.path.join(this_file, "build.py:ffi")
],
cmdclass={
'install': TestCommand,
},
)