//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

// Shorthand for unsigned int; used by the kernels below for thread indices
// and reduction loop counters.
typedef unsigned int uint;

/**
 * Block-level stage of the N^2 -> N reduction.
 *
 * N2Array is indexed as a row-major num x num table. Block b handles row
 * atomI = b / numberOfBlocksPerAtom and the blockDim.x consecutive columns
 * starting at (b % numberOfBlocksPerAtom)*blockDim.x; it sums those entries in
 * shared memory and thread 0 writes the result to partialSum slot b.
 *
 * Compile-time switches:
 *   AMOEBA_OFFSET_3          each table entry / output slot is three consecutive
 *                            floats, reduced independently; otherwise one float.
 *   AMOEBA_INCLUDE_DIAGONAL  when not defined, the atomJ == atomI entry is
 *                            left out of the sum.
 *
 * Preconditions (not checked here):
 *   - blockDim.x <= THREADS_PER_BLOCK (size of the shared scratch arrays).
 *   - blockDim.x is a power of two; the halving loop below would otherwise
 *     skip some elements.
 *
 * @param N2Array                input table, offset*num*num floats
 * @param partialSum             output, offset floats per block
 * @param num                    extent of each table dimension
 * @param numberOfBlocksPerAtom  number of blocks assigned to each row
 */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNBlockLevel)( float *N2Array, float *partialSum, int num,int numberOfBlocksPerAtom )
{

    uint tid = threadIdx.x;

    // Per-thread scratch slots; zeroed so that threads with nothing to load
    // contribute 0 to the block sum.
    __shared__ float asx[THREADS_PER_BLOCK];
    asx[tid] = 0.0f;

#ifdef AMOEBA_OFFSET_3
    __shared__ float asy[THREADS_PER_BLOCK];
    __shared__ float asz[THREADS_PER_BLOCK];
    asy[tid] = 0.0f;
    asz[tid] = 0.0f;
    int offset = 3;
#else
    int offset = 1;
#endif

    int atomI =  blockIdx.x / numberOfBlocksPerAtom;
    int atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x+tid;

    // Both indices are range-checked in every build configuration so that
    // surplus blocks/threads never read outside N2Array.
    bool load = ( atomI < num && atomJ < num );
#ifndef AMOEBA_INCLUDE_DIAGONAL
    load = load && ( atomJ != atomI );
#endif

    if( load ){
      // size_t arithmetic: offset*num*num can exceed the range of int.
      size_t index = (size_t)offset*( (size_t)atomI*(size_t)num + (size_t)atomJ );
      asx[tid] = N2Array[index];
#ifdef AMOEBA_OFFSET_3
      asy[tid] = N2Array[index+1];
      asz[tid] = N2Array[index+2];
#endif
    }
    __syncthreads(); // all loads must be visible before the reduction starts

    // Pairwise tree reduction: each pass folds the upper half of the active
    // range onto the lower half. The barrier sits outside the divergent
    // branch so every thread in the block reaches it.
    for( uint s = (blockDim.x)/2; s != 0; s >>= 1 ){
      if( tid < s ){
        asx[tid] += asx[tid+s];
#ifdef AMOEBA_OFFSET_3
        asy[tid] += asy[tid+s];
        asz[tid] += asz[tid+s];
#endif
      }
      __syncthreads();
    }

    // Slot 0 now holds the block total.
    if( tid == 0 ){
      partialSum[blockIdx.x*offset]   = asx[0];
#ifdef AMOEBA_OFFSET_3
      partialSum[blockIdx.x*offset+1] = asy[0];
      partialSum[blockIdx.x*offset+2] = asz[0];
#endif
    }

}

/**
 * Final stage of the N^2 -> N reduction.
 *
 * One thread per output element: thread t adds up the numberOfBlocksPerAtom
 * consecutive partial sums starting at slot t*numberOfBlocksPerAtom of
 * partialSum and stores the total in slot t of final. With AMOEBA_OFFSET_3
 * defined each slot is three consecutive floats that are summed
 * independently; otherwise each slot is a single float.
 *
 * @param partialSum             per-block sums, offset*num*numberOfBlocksPerAtom floats
 * @param final                  output, offset*num floats
 * @param num                    number of output elements
 * @param numberOfBlocksPerAtom  number of partial sums per output element
 */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNFinal)( float *partialSum, float *final,int num,int numberOfBlocksPerAtom )
{

    uint thread_id = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;

    // Valid outputs are 0..num-1. The bound must be inclusive: letting
    // thread_id == num through would read and write one element past the
    // end of both arrays.
    if( thread_id >= num )return;

#ifdef AMOEBA_OFFSET_3
    int offset = 3;
    float sumY = 0.0f;
    float sumZ = 0.0f;
#else
    int offset = 1;
#endif
    float sumX = 0.0f;

    // First partial-sum slot belonging to this output element.
    int index = thread_id*offset*numberOfBlocksPerAtom;
    for( int i=0; i < numberOfBlocksPerAtom; i++ ){
      sumX += partialSum[index + i*offset];
#ifdef AMOEBA_OFFSET_3
      sumY += partialSum[index + i*offset+1];
      sumZ += partialSum[index + i*offset+2];
#endif
    }

    final[thread_id*offset   ] = sumX;
#ifdef AMOEBA_OFFSET_3
    final[thread_id*offset+1 ] = sumY;
    final[thread_id*offset+2 ] = sumZ;
#endif
}