/****************************************************************
* Note: the linear index of the i particle is divided by 2 because we unroll i by 2.
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
* 
* This kernel was developed in collaboration with
* 
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/

/* kMergeFloat
 *
 * After the force kernels have run, the forces for even numbered particles
 * are in one stream and those for odd numbered particles are in another.
 * Within each stream the forces are stored in several parts, depending on
 * how many times the input stream was replicated.
 *
 * For every output element this kernel sums repfac partial values gathered
 * from pstream1 (even remainder) or pstream2 (odd remainder). To avoid an
 * extra kernel that zeroes the forces, the output is set here rather than
 * accumulated into.
 *
 * repfac          - number of partial values summed per output element
 * atomStrWidth    - width used to turn the 2D iterator value into a linear
 *                   atom index
 * pstreamStrWidth - width used to turn a linear index into a 2D gather index
 * natoms          - natoms/iUnroll is the linear-index step between parts
 * iUnroll         - divisor applied to the atom index; the remainder picks
 *                   the stream (only two streams are handled, so presumably
 *                   iUnroll is 2 -- TODO confirm with the caller)
 * count           - iterator giving the 2D position of the output element
 * pstream1/2      - gather streams holding the partial values
 * outstream       - merged result
 *
 * NOTE(review): sibling kernels in this file replaced floor( a/b ) with a
 * round/fmod form; confirm whether this kernel needs the same treatment.
 * */

kernel void kMergeFloat(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      iter float2 count<>,
      float pstream1[][],
      float pstream2[][],
      out float outstream<> )
{
   float  atomLinear;
   float  partLinear;
   float  partStride;
   float  whichStream;
   float  rep;
   float2 gatherAt;

   // linear atom index from the 2D iterator position
   atomLinear  = count.x + count.y * atomStrWidth;

   // quotient indexes the partial streams, remainder selects the stream
   partLinear  = floor( atomLinear / iUnroll );
   whichStream = atomLinear - partLinear * iUnroll;

   // distance (in linear index) between successive parts of one stream
   partStride  = natoms / iUnroll;

   outstream = 0.0f;

   // the conditional is kept inside the loop on purpose:
   // it is cheap if the hardware has predicated conditionals
   for ( rep = 0.0f; rep < repfac; rep += 1.0f ) {

      // linear index -> 2D gather index
      gatherAt.y = floor( partLinear / pstreamStrWidth );
      gatherAt.x = partLinear - gatherAt.y * pstreamStrWidth;

      if ( whichStream > 0.5f ) {
         // odd atom
         outstream += pstream2[ gatherAt ];
      } else {
         // even atom
         outstream += pstream1[ gatherAt ];
      }

      partLinear += partStride;
   }
}

/* kMergeFloat4
 *
 * float4 merge kernel. For every output element it sums repfac partial
 * values gathered from pstream1 (even remainder) or pstream2 (odd
 * remainder). The output is set, not accumulated into, so no separate
 * zeroing kernel is needed.
 *
 * repfac          - number of partial values summed per output element
 * atomStrWidth    - width used to turn the 2D output position into a linear
 *                   atom index
 * pstreamStrWidth - width used to turn a linear index into a 2D gather index
 * natoms          - natoms/iUnroll is the linear-index step between parts
 * iUnroll         - divisor applied to the atom index; the remainder picks
 *                   the stream (only two streams are handled, so presumably
 *                   iUnroll is 2 -- TODO confirm with the caller)
 * pstream1/2      - gather streams holding the partial values
 * outstream       - merged result
 * */
kernel void kMergeFloat4(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      out float4 outstream<> )
{
   float  atomLinear;
   float  partLinear;
   float  partStride;
   float  whichStream;
   float  rep;
   float2 gatherAt;

   // linear atom index from the 2D position of the output element
   atomLinear  = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );

   // quotient indexes the partial streams, remainder selects the stream
   partLinear  = floor( atomLinear / iUnroll );
   whichStream = atomLinear - partLinear * iUnroll;

   // distance (in linear index) between successive parts of one stream
   partStride  = natoms / iUnroll;

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );

   // the conditional is kept inside the loop on purpose:
   // it is cheap if the hardware has predicated conditionals
   for ( rep = 0.0f; rep < repfac; rep += 1.0f ) {

      // linear index -> 2D gather index
      gatherAt.y = floor( partLinear / pstreamStrWidth );
      gatherAt.x = partLinear - gatherAt.y * pstreamStrWidth;

      if ( whichStream > 0.5f ) {
         // odd atom
         outstream += pstream2[ gatherAt ];
      } else {
         // even atom
         outstream += pstream1[ gatherAt ];
      }

      partLinear += partStride;
   }
}


/* kMergeFloat4_4X
 *
 * Four-stream variant of the merge kernel that, as written, looks like a
 * debugging aid: the gather from pstream1..pstream4 is disabled (see the
 * commented-out block in the loop). On each of the repfac iterations the
 * kernel instead adds the index values it has just computed,
 *
 *    ( linear partial index, stream selector, gather x, gather y ),
 *
 * to outstream, which starts at zero.
 *
 * repfac          - number of loop iterations
 * atomStrWidth    - width used to turn the 2D output position into a linear
 *                   atom index
 * pstreamStrWidth - width used to turn a linear index into a 2D gather index
 * natoms          - natoms/iUnroll is the linear-index step per iteration
 * iUnroll         - divisor applied to the atom index; the remainder is the
 *                   stream selector
 * pstream1..4     - gather streams; not read while the gather is disabled
 * outstream       - accumulated index values
 * */

kernel void kMergeFloat4_4X(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> )
{
   float  atomLinear;
   float  partLinear;
   float  partStride;
   float  whichStream;
   float  rowRemainder;
   float  rep;
   float2 gatherAt;

   // linear atom index from the 2D position of the output element
   atomLinear  = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );

   // quotient indexes the partial streams, remainder selects the stream
   partLinear  = floor( atomLinear / iUnroll );
   whichStream = atomLinear - partLinear * iUnroll;

   // linear-index step applied after every iteration
   partStride  = natoms / iUnroll;

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );

   for ( rep = 0.0f; rep < repfac; rep += 1.0f ) {

      // linear index -> 2D gather index; the row is computed with fmod/round
      // instead of the earlier floor( partLinear / pstreamStrWidth )
      rowRemainder = fmod( partLinear, pstreamStrWidth );
      gatherAt.y   = round( (partLinear - rowRemainder)/pstreamStrWidth );
      gatherAt.x   = partLinear - gatherAt.y * pstreamStrWidth;

      // diagnostic output: accumulate the computed indices
      outstream += float4( partLinear, whichStream, gatherAt.x, gatherAt.y );

      /* Disabled gather, kept for reference:
      if ( whichStream < 0.5f ) {          // remainder 0
         outstream += pstream1[ gatherAt ];
      } else if( whichStream < 1.5f ){     // remainder 1
         outstream += pstream2[ gatherAt ];
      } else if( whichStream < 2.5f ){     // remainder 2
         outstream += pstream3[ gatherAt ];
      } else {                             // remainder 3
         outstream += pstream4[ gatherAt ];
      }
      */

      partLinear += partStride;
   }
}

/* kMergeFloat4_4
 *
 * Four-stream merge kernel. For every output element it sums repfac
 * partial values; on each iteration the current force index is split into
 * a quotient and a remainder with respect to iUnroll, the remainder picks
 * one of pstream1..pstream4 and the quotient is converted to a 2D gather
 * index. The output is set, not accumulated into.
 *
 * repfac          - number of partial values summed per output element
 * atomStreamWidth - width used to turn the 2D output position into a linear
 *                   atom index
 * pStreamWidth    - width used to turn the quotient into a 2D gather index
 * natoms          - not used by this kernel
 * roundNatoms     - force-index step between successive partial values
 * iUnroll         - divisor applied to the force index; four streams are
 *                   handled, so presumably 4 -- TODO confirm with the caller
 * pstream1..4     - gather streams holding the partial values
 * outstream       - merged result
 * */
kernel void kMergeFloat4_4(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> )
{
   float  forceLinear;
   float  quotient;
   float  whichStream;
   float  rep;
   float2 outPos;
   float2 gatherAt;

   // the linear atom index is the first force index

   outPos      = indexof( outstream );
   forceLinear = outPos.x + outPos.y*atomStreamWidth;

   outstream   = float4( 0.0f, 0.0f, 0.0f, 0.0f );

   for( rep = 0.0f; rep < repfac; rep += 1.0f ){

      // quotient/remainder of the force index w.r.t. iUnroll; the quotient is
      // computed with fmod/round instead of the earlier floor( forceIndex/iUnroll )
      quotient    = round( (forceLinear - fmod( forceLinear, iUnroll))/iUnroll );
      whichStream = forceLinear - iUnroll*quotient;

      // quotient -> 2D gather index (same fmod/round form for the row)
      gatherAt.y  = round( (quotient - fmod( quotient, pStreamWidth ))/pStreamWidth );
      gatherAt.x  = quotient - gatherAt.y*pStreamWidth;

      // remainder 0,1,2,3 -> pstream1,2,3,4
      if( whichStream < 1.5f ){
         if( whichStream < 0.5f ){
            outstream += pstream1[ gatherAt ];
         } else {
            outstream += pstream2[ gatherAt ];
         }
      } else {
         if( whichStream < 2.5f ){
            outstream += pstream3[ gatherAt ];
         } else {
            outstream += pstream4[ gatherAt ];
         }
      }

      forceLinear += roundNatoms;
   }
}

/* Set all four components of every float4 output element to value. */
kernel void kSetValue4(
      float value,
      out float4 outstream<> )
{
   outstream = float4( value, value, value, value );
}

/* Set all three components of every float3 output element to value. */
kernel void kSetValue3(
      float value,
      out float3 outstream<> )
{
   outstream = float3( value, value, value );
}

/* Set both components of every float2 output element to value. */
kernel void kSetValue2(
      float value,
      out float2 outstream<> )
{
   outstream = float2( value, value );
}

/* Set every float output element to value. */
kernel void kSetValue1(
      float value,
      out float outstream<> )
{
   outstream = value;
}

/* kCheck
 *
 * Diagnostic kernel: for every output element it writes
 *
 *    ( x position, y position, force index, atom index )
 *
 * where the force index is unroll*( x + y*pstreamStrWidth ) and the atom
 * index is the force index modulo natoms.
 *
 * natoms          - modulus used to derive the atom index
 * atomStrWidth    - not used by this kernel
 * pstreamStrWidth - width used to turn the 2D output position into a linear
 *                   index
 * unroll          - factor applied to the linear index
 * outstream       - the four values listed above
 * */
kernel void kCheck( float natoms, float atomStrWidth, float pstreamStrWidth, float unroll, out float4 outstream<> )
{
   float  forceIndex;
   float  atomIndex;
   float2 outPos;

   outPos     = indexof( outstream );
   forceIndex = unroll*(outPos.x + outPos.y*pstreamStrWidth);
   atomIndex  = fmod( forceIndex, natoms );

   outstream  = float4( outPos.x, outPos.y, forceIndex, atomIndex );
}

/* kAddAndMergeFloat4
 *
 * After the force kernels have run, the forces for even numbered particles
 * are in one stream and those for odd numbered particles are in another.
 * Within each stream the forces are stored in several parts, depending on
 * how many times the input stream was replicated.
 *
 * Unlike the plain merge kernels, this one does not start from zero: the
 * output is initialised with inStream (x, y, z) and a zero w component,
 * and the repfac partial values gathered from pstream1 (even remainder) or
 * pstream2 (odd remainder) are added on top of that.
 *
 * repfac          - number of partial values summed per output element
 * atomStrWidth    - width used to turn the 2D output position into a linear
 *                   atom index
 * pstreamStrWidth - width used to turn a linear index into a 2D gather index
 * natoms          - natoms/iUnroll is the linear-index step between parts
 * iUnroll         - divisor applied to the atom index; the remainder picks
 *                   the stream (only two streams are handled, so presumably
 *                   iUnroll is 2 -- TODO confirm with the caller)
 * inStream        - values the gathered sums are added to (w is ignored)
 * pstream1/2      - gather streams holding the partial values
 * outstream       - inStream.xyz plus the gathered sums; w holds only the
 *                   gathered sum
 * */

kernel void kAddAndMergeFloat4(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 inStream<>,
      float4 pstream1[][],
      float4 pstream2[][],
      out float4 outstream<> )
{
   float  atomLinear;
   float  partLinear;
   float  partStride;
   float  whichStream;
   float  rep;
   float2 gatherAt;

   // linear atom index from the 2D position of the output element
   atomLinear  = (indexof outstream).x + (indexof outstream).y * atomStrWidth;

   // quotient indexes the partial streams, remainder selects the stream;
   // the quotient uses fmod/round instead of the earlier floor( linind/iUnroll )
   partLinear  = round( (atomLinear - fmod(atomLinear, iUnroll))/iUnroll );
   whichStream = atomLinear - partLinear * iUnroll;

   // distance (in linear index) between successive parts of one stream
   partStride  = natoms / iUnroll;

   // start from the incoming values, but with w reset to zero
   outstream = float4( inStream.x, inStream.y, inStream.z, 0.0f );

   // the conditional is kept inside the loop on purpose:
   // it is cheap if the hardware has predicated conditionals
   for ( rep = 0.0f; rep < repfac; rep += 1.0f ) {

      // linear index -> 2D gather index (same fmod/round form for the row)
      gatherAt.y = round( (partLinear - fmod( partLinear, pstreamStrWidth ))/pstreamStrWidth );
      gatherAt.x = partLinear - gatherAt.y * pstreamStrWidth;

      if ( whichStream > 0.5f ) {
         // odd atom
         outstream += pstream2[ gatherAt ];
      } else {
         // even atom
         outstream += pstream1[ gatherAt ];
      }

      partLinear += partStride;
   }
}

/* kAddAndMergeFloat4_4
 *
 * Four-stream add-and-merge kernel. The output is initialised with the
 * current forces in inStream (x, y, z) and a zero w component; then repfac
 * partial values gathered from pstream1..pstream4 are added. The w entry
 * carries the Born sum values, which are used later to calculate the Born
 * radii and the obcChain term.
 *
 * repfac          - number of partial values summed per output element
 * atomStreamWidth - width used to turn the 2D output position into a linear
 *                   atom index
 * pStreamWidth    - width used to turn the quotient into a 2D gather index
 * natoms          - not used by this kernel
 * roundNatoms     - force-index step between successive partial values
 * iUnroll         - divisor applied to the force index; four streams are
 *                   handled, so presumably 4 -- TODO confirm with the caller
 * inStream        - current forces the gathered sums are added to
 * pstream1..4     - gather streams holding the partial values
 * outstream       - inStream.xyz plus the gathered sums; w holds only the
 *                   gathered sum
 * */
kernel void kAddAndMergeFloat4_4(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 inStream<>,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> ){

   float  forceLinear;
   float  quotient;
   float  whichStream;
   float  rep;
   float2 outPos;
   float2 gatherAt;

   // the linear atom index is the first force index

   outPos      = indexof( outstream );
   forceLinear = outPos.x + outPos.y*atomStreamWidth;

   // start from the current forces, with the w (Born sum) entry reset

   outstream   = float4( inStream.x, inStream.y, inStream.z, 0.0f );

   // sum over the j-loop 'duplications' by gathering from the pstreams

   for( rep = 0.0f; rep < repfac; rep += 1.0f ){

      // quotient/remainder of the force index w.r.t. iUnroll; the quotient is
      // computed with fmod/round instead of the earlier floor( forceIndex/iUnroll )
      quotient    = round( (forceLinear - fmod( forceLinear, iUnroll))/iUnroll );
      whichStream = forceLinear - iUnroll*quotient;

      // quotient -> 2D gather index (same fmod/round form for the row)
      gatherAt.y  = round( (quotient - fmod( quotient, pStreamWidth ))/pStreamWidth );
      gatherAt.x  = quotient - gatherAt.y*pStreamWidth;

      // remainder 0,1,2,3 -> pstream1,2,3,4
      if( whichStream < 1.5f ){
         if( whichStream < 0.5f ){
            outstream += pstream1[ gatherAt ];
         } else {
            outstream += pstream2[ gatherAt ];
         }
      } else {
         if( whichStream < 2.5f ){
            outstream += pstream3[ gatherAt ];
         } else {
            outstream += pstream4[ gatherAt ];
         }
      }

      forceLinear += roundNatoms;
   }
}

/* Add forces from two streams:
 * outForce = force1 + conversion * (x, y, z of force2).
 * The w component of force2 is ignored.
 * */

kernel void kAddForces3_4(
      float conversion,
      float3 force1<>,
      float4 force2<>,
      out float3 outForce<> )
{
   outForce = conversion*force2.xyz + force1;
}

/* Copy a float4 stream to another, element by element.
 * */

kernel void kCopyFloat4(
      float4 inForce<>,
      out float4 outForce<> )
{
   outForce = inForce;
}

/* Copy a float3 stream into a float4 stream:
 * x, y, z are taken from inForce and w is set to zero.
 * */

kernel void kCopyFloat3To4(
      float3 inForce<>,
      out float4 outForce<> )
{
   outForce = float4( inForce.x, inForce.y, inForce.z, 0.0f );
}