/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * Created by: Hang Zhang
 * ECE Department, Rutgers University
 * Email: zhang.hang@rutgers.edu
 * Copyright (c) 2017
 *
 * This source code is licensed under the MIT-style license found in the
 * LICENSE file in the root directory of this source tree
 *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 */
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/device_tensor.h"
#else

/*
 * Wrap a THC tensor in a Dim-dimensional THCDeviceTensor of `real` elements.
 *
 * Returns a default-constructed THCDeviceTensor when `t` is NULL; otherwise
 * forwards to toDeviceTensor<real, Dim>(state, t).
 *
 * `Dim` has to be supplied explicitly by the caller because it does not
 * appear in the argument list, e.g. devicetensor<3>(state, t).
 */
template <int Dim>
THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
  if (!t) {
    return THCDeviceTensor<real, Dim>();
  }
  return toDeviceTensor<real, Dim>(state, t);

  /* Disabled alternative, kept for reference: build a view whose trailing
   * dimensions are collapsed or expanded so that the rank becomes Dim.

  int inDim = THCTensor_(nDimension)(state, t);
  if (inDim == Dim) {
    return toDeviceTensor<real, Dim>(state, t);
  }
  // View in which the last dimensions are collapsed or expanded as needed
  THAssert(THCTensor_(isContiguous)(state, t));
  int size[Dim];
  for (int i = 0; i < Dim || i < inDim; ++i) {
    if (i < Dim && i < inDim) {
      size[i] = t->size[i];
    } else if (i < Dim) {
      size[i] = 1;
    } else {
      size[Dim - 1] *= t->size[i];
    }
  }
  return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
  */
}

/*
 * Pair of `real` values, used so that a reduce-sum can accumulate two
 * elements at once.
 */
struct Encoding_(Float2) {
  real v1, v2;

  // Default construction leaves v1 and v2 uninitialized.
  __device__ Encoding_(Float2)() {}

  // Component-wise construction.
  __device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}

  // Broadcast a single value to both components.
  __device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}

  // Broadcast an integer (e.g. a literal 0) to both components.
  __device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}

  // Component-wise accumulation.
  __device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) {
    v1 += a.v1;
    v2 += a.v2;
    return *this;
  }
};

/*
 * Sum `val` over the threads of a warp and return the result to the caller.
 *
 * Three implementations are selected at compile time:
 *   - CUDA >= 9.0:        __shfl_xor_sync with a full-warp mask (0xffffffff),
 *                         so all 32 lanes of the warp must reach this call.
 *   - older CUDA, SM30+:  the legacy mask-less __shfl_xor.
 *   - otherwise:          a shared-memory fallback.
 *
 * The shuffle paths exchange with lane (laneId ^ (1 << i)) for
 * i in [0, getMSB(WARP_SIZE)).
 * NOTE(review): presumably getMSB(WARP_SIZE) == log2(WARP_SIZE), which makes
 * this a full XOR butterfly leaving the total in every lane -- confirm
 * against the definition of getMSB.
 */
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if CUDA_VERSION >= 9000
  unsigned int mask = 0xffffffff;
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
  }
#elif __CUDA_ARCH__ >= 300
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    val += __shfl_xor(val, 1 << i, WARP_SIZE);
  }
#else
  // Shared-memory fallback. The scratch array is indexed by threadIdx.x
  // only, so threadIdx.x must stay below MAX_BLOCK_SIZE.
  __shared__ real values[MAX_BLOCK_SIZE];
  values[threadIdx.x] = val;
  // Only a memory fence, not a barrier.
  // NOTE(review): this appears to rely on the lanes of a warp executing in
  // lockstep so that every store below `base + WARP_SIZE` has happened
  // before the reads -- confirm for the targeted architectures.
  __threadfence_block();
  // First slot belonging to this thread's warp.
  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
  // Add the other WARP_SIZE - 1 slots of the warp, starting at the
  // neighbouring lane and wrapping around.
  for (int i = 1; i < WARP_SIZE; i++) {
    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
  }
#endif
  return val;
}

/*
 * Warp-level sum of a Float2: each component is reduced independently with
 * Encoding_(rwarpSum) and the pair of sums is returned.
 */
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
    Encoding_(Float2) value) {
  value.v1 = Encoding_(rwarpSum)(value.v1);
  value.v2 = Encoding_(rwarpSum)(value.v2);
  return value;
}

#endif