/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * Created by: Hang Zhang
 * ECE Department, Rutgers University
 * Email: zhang.hang@rutgers.edu
 * Copyright (c) 2017
 *
 * This source code is licensed under the MIT-style license found in the
 * LICENSE file in the root directory of this source tree
 *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
/*
 * NOTE(review): this file arrived with every "<...>" span stripped out
 * (template arguments, loop bounds and kernel launch configurations were
 * lost, leaving "<<>>" residue). The missing spans below are reconstructed
 * from the surviving fragments and from the uniform THC pattern those
 * fragments show — confirm against project history before shipping.
 *
 * Notation (inferred from the indexing in the surviving kernel bodies):
 *   B = batch, N = number of features, K = number of codewords,
 *   D = feature dimension.
 *   X  (B x N x D)   input features     A (B x N x K)      assign weights
 *   C  (K x D)       codewords          R (B x N x K x D)  residuals
 *   E  (B x K x D)   encodings          S (K)              scale factors
 */

__global__ void Encoding_(AggregateE_Forward_kernel) (
    THCDeviceTensor<real, 3> E,
    THCDeviceTensor<real, 3> A,
    THCDeviceTensor<real, 3> X,
    THCDeviceTensor<real, 2> C)
/*
 * Aggregate (explicit residual) forward:
 *   E[b][k][d] = sum_i A[b][i][k] * (X[b][i][d] - C[k][d])
 * Launch: grid.z = b; block (x, y) tiles the (d, k) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, N;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    d = blockIdx.x * blockDim.x + threadIdx.x;
    k = blockIdx.y * blockDim.y + threadIdx.y;
    N = A.getSize(1);
    /* boundary check for output */
    if (d >= E.getSize(2) || k >= E.getSize(1)) return;
    sum = 0;
    /* main operation: reduce over the N input features */
    for (i = 0; i < N; i++) {
        /* NOTE(review): loop body reconstructed — residual weighted by A */
        sum += A[b][i][k] * (X[b][i][d] - C[k][d]);
    }
    E[b][k][d] = sum;
}

void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
    THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
 * Host wrapper: checks devices/dims, wraps tensors and launches the
 * AggregateE forward kernel on the current THC stream.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
    if (THCTensor_(nDimension)(state, E_) != 3 ||
        THCTensor_(nDimension)(state, A_) != 3 ||
        THCTensor_(nDimension)(state, X_) != 3 ||
        THCTensor_(nDimension)(state, C_) != 2)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
    THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
    THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
    THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    /* ceil-division so a partial 16x16 tile still covers the tail */
    dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1, E.getSize(0));
    Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (E, A, X, C);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Forward_kernel) (
    THCDeviceTensor<real, 3> E,
    THCDeviceTensor<real, 3> A,
    THCDeviceTensor<real, 4> R)
/*
 * Aggregate (precomputed residual) forward:
 *   E[b][k][d] = sum_i A[b][i][k] * R[b][i][k][d]
 * Launch: grid.z = b; block (x, y) tiles the (d, k) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, N;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    d = blockIdx.x * blockDim.x + threadIdx.x;
    k = blockIdx.y * blockDim.y + threadIdx.y;
    N = A.getSize(1);
    /* boundary check for output */
    sum = 0;
    if (d >= E.getSize(2) || k >= E.getSize(1)) return;
    /* main operation */
    for (i = 0; i < N; i++) {
        /* NOTE(review): loop body reconstructed */
        sum += A[b][i][k] * R[b][i][k][d];
    }
    E[b][k][d] = sum;
}

void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
    THCTensor *A_, THCTensor *R_)
/*
 * Host wrapper for the Aggregate forward kernel.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 3, E_, A_, R_);
    if (THCTensor_(nDimension)(state, E_) != 3 ||
        THCTensor_(nDimension)(state, A_) != 3 ||
        THCTensor_(nDimension)(state, R_) != 4)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
    THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1, E.getSize(0));
    Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (E, A, R);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(AggregateE_Backward_kernel) (
    THCDeviceTensor<real, 3> GA,
    THCDeviceTensor<real, 3> GE,
    THCDeviceTensor<real, 3> A,
    THCDeviceTensor<real, 3> X,
    THCDeviceTensor<real, 2> C)
/*
 * AggregateE backward w.r.t. the assign weights:
 *   GA[b][i][k] = sum_d GE[b][k][d] * (X[b][i][d] - C[k][d])
 * Launch: grid.z = b; block (x, y) tiles the (k, i) output plane.
 * (A is carried in the signature for interface parity with the forward
 * pass; the GA reduction itself does not read it.)
 */
{
    /* declarations of the variables */
    int b, k, d, i, D;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    k = blockIdx.x * blockDim.x + threadIdx.x;
    D = GE.getSize(2);
    /* boundary check for output */
    if (k >= GA.getSize(2) || i >= GA.getSize(1)) return;
    /* main operation: reduce the upstream gradient over channels */
    sum = 0;
    for (d = 0; d < D; d++) {
        /* NOTE(review): loop body reconstructed — chain rule through
         * E[b][k][d] = sum_i A[b][i][k]*(X[b][i][d]-C[k][d]) */
        sum += GE[b][k][d] * (X[b][i][d] - C[k][d]);
    }
    GA[b][i][k] = sum;
}

void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
    THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
 * Host wrapper for the AggregateE backward kernel.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
    if (THCTensor_(nDimension)(state, GA_) != 3 ||
        THCTensor_(nDimension)(state, GE_) != 3 ||
        THCTensor_(nDimension)(state, A_)  != 3 ||
        THCTensor_(nDimension)(state, X_)  != 3 ||
        THCTensor_(nDimension)(state, C_)  != 2)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
    THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
    THCDeviceTensor<real, 3> A  = devicetensor<3>(state, A_);
    THCDeviceTensor<real, 3> X  = devicetensor<3>(state, X_);
    THCDeviceTensor<real, 2> C  = devicetensor<2>(state, C_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1, GA.getSize(0));
    Encoding_(AggregateE_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GA, GE, A, X, C);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
    THCDeviceTensor<real, 3> GA,
    THCDeviceTensor<real, 4> GR,
    THCDeviceTensor<real, 3> GE,
    THCDeviceTensor<real, 3> A,
    THCDeviceTensor<real, 4> R)
/*
 * Aggregate backward:
 *   GA[b][i][k]    = sum_d GE[b][k][d] * R[b][i][k][d]
 *   GR[b][i][k][d] = A[b][i][k] * GE[b][k][d]
 * Launch: grid.z = b; block (x, y) tiles the (k, i) output plane; each
 * thread owns one (b, i, k) row of GR, so the GR store is race-free.
 */
{
    /* declarations of the variables */
    int b, k, d, i, D;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    k = blockIdx.x * blockDim.x + threadIdx.x;
    D = GE.getSize(2);
    /* boundary check for output */
    if (k >= GR.getSize(2) || i >= GR.getSize(1)) return;
    /* main operation */
    sum = 0;
    for (d = 0; d < D; d++) {
        /* NOTE(review): loop body reconstructed — GR must be produced
         * here since this is the only kernel the host wrapper launches */
        sum += GE[b][k][d] * R[b][i][k][d];
        GR[b][i][k][d] = A[b][i][k] * GE[b][k][d];
    }
    GA[b][i][k] = sum;
}

void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
    THCTensor *GR_, THCTensor *GE_, THCTensor *A_, THCTensor *R_)
/*
 * Host wrapper for the Aggregate backward kernel.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 5, GA_, GR_, GE_, A_, R_);
    if (THCTensor_(nDimension)(state, GA_) != 3 ||
        THCTensor_(nDimension)(state, GR_) != 4 ||
        THCTensor_(nDimension)(state, GE_) != 3 ||
        THCTensor_(nDimension)(state, A_)  != 3 ||
        THCTensor_(nDimension)(state, R_)  != 4)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
    THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
    THCDeviceTensor<real, 3> A  = devicetensor<3>(state, A_);
    THCDeviceTensor<real, 4> R  = devicetensor<4>(state, R_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1, GA.getSize(0));
    Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GA, GR, GE, A, R);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Residual_Forward_kernel) (
    THCDeviceTensor<real, 4> R,
    THCDeviceTensor<real, 3> X,
    THCDeviceTensor<real, 2> D)
/*
 * Residual forward: R[b][i][k][d] = X[b][i][d] - D[k][d]
 * Launch: grid.z = b; block (x, y) tiles the (d, i) plane; each thread
 * writes all K codeword residuals for its (b, i, d).
 */
{
    /* declarations of the variables */
    int b, k, d, i, K;
    /* Get the index and channels */
    b = blockIdx.z;
    d = blockIdx.x * blockDim.x + threadIdx.x;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    K = R.getSize(2);
    /* boundary check for output */
    if (d >= X.getSize(2) || i >= X.getSize(1)) return;
    /* main operation */
    for (k = 0; k < K; k++) {
        /* NOTE(review): loop body reconstructed */
        R[b][i][k][d] = X[b][i][d] - D[k][d];
    }
}

void Encoding_(Residual_Forward)(THCState *state, THCTensor *R_,
    THCTensor *X_, THCTensor *D_)
/*
 * Host wrapper for the Residual forward kernel.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 3, R_, X_, D_);
    if (THCTensor_(nDimension)(state, R_) != 4 ||
        THCTensor_(nDimension)(state, X_) != 3 ||
        THCTensor_(nDimension)(state, D_) != 2)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
    THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
    THCDeviceTensor<real, 2> D = devicetensor<2>(state, D_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(X.getSize(2)/16+1, X.getSize(1)/16+1, X.getSize(0));
    Encoding_(Residual_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (R, X, D);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ResidualX_Backward_kernel) (
    THCDeviceTensor<real, 4> GR,
    THCDeviceTensor<real, 3> GX)
/*
 * Residual backward w.r.t. X (dR/dX = +1):
 *   GX[b][i][d] = sum_k GR[b][i][k][d]
 * Launch: grid.z = b; block (x, y) tiles the (d, i) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, K;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    d = blockIdx.x * blockDim.x + threadIdx.x;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    K = GR.getSize(2);
    /* boundary check for output */
    if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
    /* main operation */
    sum = 0;
    for (k = 0; k < K; k++) {
        /* NOTE(review): loop body reconstructed */
        sum += GR[b][i][k][d];
    }
    GX[b][i][d] = sum;
}

__global__ void Encoding_(ResidualD_Backward_kernel) (
    THCDeviceTensor<real, 4> GR,
    THCDeviceTensor<real, 2> GD)
/*
 * Residual backward w.r.t. the codewords (dR/dD = -1):
 *   GD[k][d] = - sum_b sum_i GR[b][i][k][d]
 * Launch: single 2D grid; block (x, y) tiles the (d, k) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, B, N;
    real sum;
    /* Get the index and channels */
    d = blockIdx.x * blockDim.x + threadIdx.x;
    k = blockIdx.y * blockDim.y + threadIdx.y;
    B = GR.getSize(0);
    N = GR.getSize(1);
    /* boundary check for output */
    if (d >= GD.getSize(1) || k >= GD.getSize(0)) return;
    /* main operation: reduce over batch and feature indices */
    sum = 0;
    for (b = 0; b < B; b++) {
        for (i = 0; i < N; i++) {
            /* NOTE(review): loop body reconstructed */
            sum += GR[b][i][k][d];
        }
    }
    GD[k][d] = -sum;
}

void Encoding_(Residual_Backward)(THCState *state, THCTensor *GR_,
    THCTensor *GX_, THCTensor *GD_)
/*
 * Host wrapper: launches both Residual backward kernels (w.r.t. X and
 * w.r.t. the codewords) on the current THC stream.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 3, GR_, GX_, GD_);
    if (THCTensor_(nDimension)(state, GR_) != 4 ||
        THCTensor_(nDimension)(state, GX_) != 3 ||
        THCTensor_(nDimension)(state, GD_) != 2)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
    THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
    THCDeviceTensor<real, 2> GD = devicetensor<2>(state, GD_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1, GX.getSize(0));
    Encoding_(ResidualX_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GR, GX);
    THCudaCheck(cudaGetLastError());
    dim3 blocks2(GD.getSize(1)/16+1, GD.getSize(0)/16+1);
    Encoding_(ResidualD_Backward_kernel)<<<blocks2, threads, 0, stream>>>
        (GR, GD);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
    THCDeviceTensor<real, 3> SL,
    THCDeviceTensor<real, 3> X,
    THCDeviceTensor<real, 2> C,
    THCDeviceTensor<real, 1> S)
/*
 * Scaled-L2 forward:
 *   SL[b][i][k] = S[k] * sum_d (X[b][i][d] - C[k][d])^2
 * Launch: grid.z = b; block (x, y) tiles the (k, i) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, D;
    real r, sum;
    /* Get the index and channels */
    b = blockIdx.z;
    k = blockIdx.x * blockDim.x + threadIdx.x;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    D = X.getSize(2);
    /* boundary check for output */
    if (k >= SL.getSize(2) || i >= SL.getSize(1)) return;
    /* main operation */
    sum = 0;
    for (d = 0; d < D; d++) {
        /* NOTE(review): loop body reconstructed — r holds the residual */
        r = X[b][i][d] - C[k][d];
        sum += r * r;
    }
    SL[b][i][k] = S[k] * sum;
}

void Encoding_(ScaledL2_Forward)(THCState *state, THCTensor *SL_,
    THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
 * Host wrapper for the Scaled-L2 forward kernel.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
    if (THCTensor_(nDimension)(state, SL_) != 3 ||
        THCTensor_(nDimension)(state, X_)  != 3 ||
        THCTensor_(nDimension)(state, C_)  != 2 ||
        THCTensor_(nDimension)(state, S_)  != 1)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
    THCDeviceTensor<real, 3> X  = devicetensor<3>(state, X_);
    THCDeviceTensor<real, 2> C  = devicetensor<2>(state, C_);
    THCDeviceTensor<real, 1> S  = devicetensor<1>(state, S_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(SL.getSize(2)/16+1, SL.getSize(1)/16+1, SL.getSize(0));
    Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (SL, X, C, S);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
    THCDeviceTensor<real, 3> GSL,
    THCDeviceTensor<real, 3> GX,
    THCDeviceTensor<real, 3> X,
    THCDeviceTensor<real, 2> C,
    THCDeviceTensor<real, 1> S)
/*
 * Scaled-L2 backward w.r.t. X:
 *   GX[b][i][d] = sum_k 2 * S[k] * GSL[b][i][k] * (X[b][i][d] - C[k][d])
 * Launch: grid.z = b; block (x, y) tiles the (d, i) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, K;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    d = blockIdx.x * blockDim.x + threadIdx.x;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    K = C.getSize(0);
    /* boundary check for output */
    if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
    /* main operation */
    sum = 0;
    for (k = 0; k < K; k++) {
        /* NOTE(review): loop body reconstructed — d/dX of S[k]*||X-C||^2 */
        sum += 2 * S[k] * GSL[b][i][k] * (X[b][i][d] - C[k][d]);
    }
    GX[b][i][d] = sum;
}

__global__ void Encoding_(ScaledL2C_Backward_kernel) (
    THCDeviceTensor<real, 3> GSL,
    THCDeviceTensor<real, 2> GC,
    THCDeviceTensor<real, 3> X,
    THCDeviceTensor<real, 2> C,
    THCDeviceTensor<real, 1> S)
/*
 * Scaled-L2 backward w.r.t. the codewords:
 *   GC[k][d] = sum_b sum_i 2 * S[k] * GSL[b][i][k] * (C[k][d] - X[b][i][d])
 * Launch: single 2D grid; block (x, y) tiles the (d, k) output plane.
 */
{
    /* declarations of the variables */
    int b, k, d, i, B, N;
    real sum;
    /* Get the index and channels */
    d = blockIdx.x * blockDim.x + threadIdx.x;
    k = blockIdx.y * blockDim.y + threadIdx.y;
    B = X.getSize(0);
    N = X.getSize(1);
    /* boundary check for output */
    if (d >= GC.getSize(1) || k >= GC.getSize(0)) return;
    /* main operation: reduce over batch and feature indices */
    sum = 0;
    for (b = 0; b < B; b++) {
        for (i = 0; i < N; i++) {
            /* NOTE(review): loop body reconstructed — sign flips vs. GX */
            sum += 2 * S[k] * GSL[b][i][k] * (C[k][d] - X[b][i][d]);
        }
    }
    GC[k][d] = sum;
}

void Encoding_(ScaledL2_Backward)(THCState *state, THCTensor *GSL_,
    THCTensor *GX_, THCTensor *GC_, THCTensor *X_, THCTensor *C_,
    THCTensor *S_)
/*
 * Host wrapper: launches both Scaled-L2 backward kernels (w.r.t. X and
 * w.r.t. the codewords) on the current THC stream.
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
    if (THCTensor_(nDimension)(state, GSL_) != 3 ||
        THCTensor_(nDimension)(state, GX_)  != 3 ||
        THCTensor_(nDimension)(state, GC_)  != 2 ||
        THCTensor_(nDimension)(state, X_)   != 3 ||
        THCTensor_(nDimension)(state, C_)   != 2 ||
        THCTensor_(nDimension)(state, S_)   != 1)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
    THCDeviceTensor<real, 3> GX  = devicetensor<3>(state, GX_);
    THCDeviceTensor<real, 2> GC  = devicetensor<2>(state, GC_);
    THCDeviceTensor<real, 3> X   = devicetensor<3>(state, X_);
    THCDeviceTensor<real, 2> C   = devicetensor<2>(state, C_);
    THCDeviceTensor<real, 1> S   = devicetensor<1>(state, S_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1, GX.getSize(0));
    Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GSL, GX, X, C, S);
    THCudaCheck(cudaGetLastError());
    /* BUGFIX(review): original sized the y dimension by GX.getSize(0)
     * (the batch size B) instead of GC.getSize(0) (the codeword count K);
     * with B < K part of GC was never computed. */
    dim3 blocks2(GC.getSize(1)/16+1, GC.getSize(0)/16+1);
    Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads, 0, stream>>>
        (GSL, GC, X, C, S);
    THCudaCheck(cudaGetLastError());
}
#endif