/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ * Created by: Hang Zhang * ECE Department, Rutgers University * Email: zhang.hang@rutgers.edu * Copyright (c) 2017 * * This source code is licensed under the MIT-style license found in the * LICENSE file in the root directory of this source tree *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ #ifndef THC_GENERIC_FILE #define THC_GENERIC_FILE "generic/encoding_utils.c" #else struct Encoding_(AggOp) { __device__ Encoding_(AggOp)(THCDeviceTensor a, THCDeviceTensor x, THCDeviceTensor c) : A(a), X(x), C(c) {} __device__ __forceinline__ real operator()(int b, int i, int k, int d) { return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg()); } THCDeviceTensor A; THCDeviceTensor X; THCDeviceTensor C; }; __device__ real Encoding_(reduce_agg)( Encoding_(AggOp) op, int b, int k, int d, int N) { real sum = 0; for (int x = threadIdx.x; x < N; x += blockDim.x) { sum += op(b,x,k,d); } // sum over NumThreads within a warp sum = Encoding_(rwarpSum)(sum); // 'transpose', and reduce within warp again __shared__ real shared[32]; __syncthreads(); if (threadIdx.x % WARP_SIZE == 0) { if (threadIdx.x / WARP_SIZE < 32) { shared[threadIdx.x / WARP_SIZE] = sum; } } if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { // zero out the other entries in shared shared[threadIdx.x] = (real) 0; } __syncthreads(); if (threadIdx.x / WARP_SIZE == 0) { sum = Encoding_(rwarpSum)(shared[threadIdx.x]); if (threadIdx.x == 0) { shared[0] = sum; } } __syncthreads(); // Everyone picks it up, should be broadcast into the whole gradInput return shared[0]; } struct Encoding_(AggBackOp) { __device__ Encoding_(AggBackOp)(THCDeviceTensor ge, THCDeviceTensor x, THCDeviceTensor c) : GE(ge), X(x), C(c) {} __device__ __forceinline__ real operator()(int b, int i, int k, int d) { return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg()); } THCDeviceTensor GE; THCDeviceTensor X; THCDeviceTensor C; }; __device__ real Encoding_(reduce_aggback)( Encoding_(AggBackOp) op, int b, int i, int k, int D) { real sum = 0; for (int x = threadIdx.x; x < D; x += blockDim.x) { sum += op(b,i,k,x); } // sum over NumThreads within a warp sum = Encoding_(rwarpSum)(sum); // 'transpose', and reduce within warp again __shared__ real shared[32]; __syncthreads(); if (threadIdx.x % WARP_SIZE == 0) { if (threadIdx.x / WARP_SIZE < 32) { shared[threadIdx.x / WARP_SIZE] = sum; } } if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { // zero out the other entries in shared shared[threadIdx.x] = (real) 0; } __syncthreads(); if (threadIdx.x / WARP_SIZE == 0) { sum = Encoding_(rwarpSum)(shared[threadIdx.x]); if (threadIdx.x == 0) { shared[0] = sum; } } __syncthreads(); // Everyone picks it up, should be broadcast into the whole gradInput return shared[0]; } struct Encoding_(L2Op) { __device__ Encoding_(L2Op)(THCDeviceTensor x, THCDeviceTensor c) : X(x), C(c) {} __device__ __forceinline__ real operator()(int b, int i, int k, int d) { real r = X[b][i][d].ldg() - C[k][d].ldg(); return r * r; } THCDeviceTensor X; THCDeviceTensor C; }; __device__ real Encoding_(reduce_sl2)( Encoding_(L2Op) op, int b, int i, int k, int D) { real sum = 0; for (int x = threadIdx.x; x < D; x += blockDim.x) { sum += op(b,i,k,x); } // sum over NumThreads within a warp sum = Encoding_(rwarpSum)(sum); // 'transpose', and reduce within warp again __shared__ real shared[32]; __syncthreads(); if (threadIdx.x % WARP_SIZE == 0) { if (threadIdx.x / WARP_SIZE < 32) { shared[threadIdx.x / WARP_SIZE] = sum; } } if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { // zero out the other entries in shared shared[threadIdx.x] = (real) 0; } __syncthreads(); if (threadIdx.x / WARP_SIZE == 0) { sum = Encoding_(rwarpSum)(shared[threadIdx.x]); if (threadIdx.x == 0) { shared[0] = sum; } } __syncthreads(); // Everyone picks it up, should be broadcast into the whole gradInput return shared[0]; } struct Encoding_(L2XBackOp) { __device__ Encoding_(L2XBackOp)( THCDeviceTensor gsl, THCDeviceTensor x, THCDeviceTensor c, THCDeviceTensor s ) : GSL(gsl), X(x), C(c), S(s) {} __device__ __forceinline__ real operator()(int b, int i, int k, int d) { return 2*S[k].ldg() * GSL[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg()); } THCDeviceTensor GSL; THCDeviceTensor X; THCDeviceTensor C; THCDeviceTensor S; }; __device__ real Encoding_(reduce_sl2xback)( Encoding_(L2XBackOp) op, int b, int i, int d, int K) { real sum = 0; for (int x = threadIdx.x; x < K; x += blockDim.x) { sum += op(b,i,x,d); } // sum over NumThreads within a warp sum = Encoding_(rwarpSum)(sum); // 'transpose', and reduce within warp again __shared__ real shared[32]; __syncthreads(); if (threadIdx.x % WARP_SIZE == 0) { if (threadIdx.x / WARP_SIZE < 32) { shared[threadIdx.x / WARP_SIZE] = sum; } } if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { // zero out the other entries in shared shared[threadIdx.x] = (real) 0; } __syncthreads(); if (threadIdx.x / WARP_SIZE == 0) { sum = Encoding_(rwarpSum)(shared[threadIdx.x]); if (threadIdx.x == 0) { shared[0] = sum; } } __syncthreads(); // Everyone picks it up, should be broadcast into the whole gradInput return shared[0]; } struct Encoding_(L2CBackOp) { __device__ Encoding_(L2CBackOp)( THCDeviceTensor gsl, THCDeviceTensor x, THCDeviceTensor c, THCDeviceTensor s ) : GSL(gsl), X(x), C(c), S(s) {} __device__ __forceinline__ real operator()(int b, int i, int k, int d) { return -2*S[k].ldg() * GSL[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg()); } THCDeviceTensor GSL; THCDeviceTensor X; THCDeviceTensor C; THCDeviceTensor S; }; __device__ real Encoding_(reduce_sl2cback)( Encoding_(L2CBackOp) op, int k, int d, int B, int N) { real sum = 0; for (int batch = 0; batch < B; ++batch) { for (int x = threadIdx.x; x < N; x += blockDim.x) { sum += op(batch,x,k,d); } } // sum over NumThreads within a warp sum = Encoding_(rwarpSum)(sum); // 'transpose', and reduce within warp again __shared__ real shared[32]; __syncthreads(); if (threadIdx.x % WARP_SIZE == 0) { if (threadIdx.x / WARP_SIZE < 32) { shared[threadIdx.x / WARP_SIZE] = sum; } } if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { // zero out the other entries in shared shared[threadIdx.x] = (real) 0; } __syncthreads(); if (threadIdx.x / WARP_SIZE == 0) { sum = Encoding_(rwarpSum)(shared[threadIdx.x]); if (threadIdx.x == 0) { shared[0] = sum; } } __syncthreads(); // Everyone picks it up, should be broadcast into the whole gradInput return shared[0]; } #endif