/****************************************************************
 //Linear index of i particle, divided by 2 because we unroll i by 2
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
* 
* This kernel was developed in collaboration with
* 
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/

/* After forces above, we have the forces for even numbered particles
 * in one stream, odd numbered particles in another.
 * In each stream, the forces are in several parts depending on how
 * many times we replicated the input stream.
 *
 * To avoid an extra kernel to zero forces, this sets the forces
 * rather than adding to it.
 * */

kernel void kMergeFloat( 
      float repfac, 
      float atomStrWidth, 
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      iter float2 count<>, 
      float pstream1[][], 
      float pstream2[][], 
      out float outstream<> )
{
   float linind;
   float2 pindex;
   float odd;
   float i;
   
   //convert to linear atom index
   linind = count.x + count.y * atomStrWidth;
   
   //If odd or even, we pick from diferent streams.
   odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor( linind / iUnroll );

   outstream = 0.0f;
   
   //If we have predicated conditionals, we should 
   //keep the conditional inside the loop

   for ( i = 0; i < repfac; i+=1.0f ) {

      pindex.y = floor( linind / pstreamStrWidth );
      pindex.x = linind - pindex.y * pstreamStrWidth;
      
      if ( odd > 0.5f ) { //is odd
         outstream += pstream2[ pindex ];
      } else {
         outstream += pstream1[ pindex ];
      }
      linind += natoms/iUnroll; 
   }
}

kernel void kMergeFloat4( 
      float repfac, 
      float atomStrWidth, 
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][], 
      float4 pstream2[][], 
      out float4 outstream<> )
{
   float linind;
   float2 pindex;
   float odd;
   float i;
   
   //convert to linear atom index
   linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );
   
   //If odd or even, we pick from diferent streams.
   odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor( linind / iUnroll );

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   
   //If we have predicated conditionals, we should 
   //keep the conditional inside the loop

   for ( i = 0.0f; i < repfac; i+= 1.0f ) {

      pindex.y = floor( linind / pstreamStrWidth );
      pindex.x = linind - pindex.y * pstreamStrWidth;
      
      if ( odd > 0.5f ) { //is odd
         outstream += pstream2[ pindex ];
      } else {
         outstream += pstream1[ pindex ];
      }
      linind += natoms/iUnroll; 
   }
}


/* After forces above, we have the forces for even numbered particles
 * in one stream, odd numbered particles in another.
 * In each stream, the forces are in several parts depending on how
 * many times we replicated the input stream.
 *
 * To avoid an extra kernel to zero forces, this sets the forces
 * rather than adding to it.
 * */

kernel void kMergeFloat4_4X( 
      float repfac, 
      float atomStrWidth, 
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][], 
      float4 pstream2[][], 
      float4 pstream3[][], 
      float4 pstream4[][], 
      out float4 outstream<> )
{
   float linind;
   float2 pindex;
   float odd;
   float i;
   
   //convert to linear atom index
   linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );
   
   //If odd or even, we pick from diferent streams.
   odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor( linind / iUnroll );

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   
   //If we have predicated conditionals, we should 
   //keep the conditional inside the loop

   for ( i = 0.0f; i < repfac; i+= 1.0f ) {

      //pindex.y = floor( linind / pstreamStrWidth );
      //pindex.x = linind - pindex.y * pstreamStrWidth;
      
      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth ); //bixia modify
      pindex.x = linind - pindex.y * pstreamStrWidth;

outstream += float4( linind, odd, pindex.x,  pindex.y );
/*
      if ( odd < 0.5f ) { //is odd
         outstream += pstream1[ pindex ];
      } else if( odd < 1.5f ){
         outstream += pstream2[ pindex ];
      } else if( odd < 2.5f ){
         outstream += pstream3[ pindex ];
      } else {
         outstream += pstream4[ pindex ];
      }
*/

      linind += natoms/iUnroll; 
   }
}

kernel void kMergeFloat4_4( 
      float repfac, 
      float atomStreamWidth, 
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 pstream1[][], 
      float4 pstream2[][], 
      float4 pstream3[][], 
      float4 pstream4[][], 
      out float4 outstream<> )
{
   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   
   // given atom index find force indices and streams

   pindex      = indexof( outstream );
   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
   forceIndex  = atomIndex;

   outstream   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   
   for( i = 0.0f; i < repfac; i += 1.0f ){

      // qIndex            = floor( forceIndex/iUnroll );
      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );

      qOff              = forceIndex - iUnroll*qIndex;

      // pindex.y          = floor( qIndex/ pStreamWidth );
      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );

      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
      pindex.x          = qIndex - pindex.y*pStreamWidth;

// outstream += float4( forceIndex, qIndex, pindex.x,  pindex.y );

      if ( qOff < 0.5f ){ 
         outstream += pstream1[ pindex ];
      } else if( qOff < 1.5f ){
         outstream += pstream2[ pindex ];
      } else if( qOff < 2.5f ){
         outstream += pstream3[ pindex ];
      } else {
         outstream += pstream4[ pindex ];
      }

      forceIndex += roundNatoms;
   }
}


kernel void kPostObcLoop1( 
    float repfac, 
    float atomStreamWidth, 
    float pStreamWidth,
    float natoms,
    float roundNatoms,
    float iUnroll,
    float4 pstream1[][], 
    float4 pstream2[][], 
    float4 pstream3[][], 
    float4 pstream4[][], 
    float obcChain<>, 
    float bornRadii<>, 
    out float4 outstream<>,
    out float bornRadii2Force<> )
{
    
    // ---------------------------------------------------------------------------------------
    
    float atomIndex, forceIndex, qIndex, qOff;
    float2 pindex;
    float i;
    float4 o1,o2,o3,o4;
    float4 tmp;
    float4 zero4;
    
    // given atom index find force indices and streams
    
    pindex      = indexof( outstream );
    atomIndex   = pindex.x + pindex.y*atomStreamWidth;
    forceIndex  = atomIndex;
    
    zero4   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
    outstream = zero4;
    
    for( i = 0.0f; i < repfac; i += 1.0f ){
        
        // qIndex            = floor( forceIndex/iUnroll );
        qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
        
        qOff              = forceIndex - iUnroll*qIndex;
        
        // pindex.y          = floor( qIndex/ pStreamWidth );
        pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
        
        // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
        pindex.x          = qIndex - pindex.y*pStreamWidth;
        
// outstream += float4( forceIndex, qIndex, pindex.x,  pindex.y );

        // This is going to cause really divergent code and we are
        // going to end up doing all the fetches anyway...
        
        if ( qOff < 0.5f ){ 
            outstream += pstream1[ pindex ];
        } else if( qOff < 1.5f ){
            outstream += pstream2[ pindex ];
        } else if( qOff < 2.5f ){
            outstream += pstream3[ pindex ];
        } else {
            outstream += pstream4[ pindex ];
        }

//         o1 = pstream1[ pindex ];
//         o2 = pstream2[ pindex ];
//         o3 = pstream3[ pindex ];
//         o4 = pstream4[ pindex ];

//         tmp = qOff < 0.5f ? o1 : o2;
//         tmp = qOff < 1.5f ? tmp : o3;
//         tmp = qOff < 2.5f ? tmp : o4;
        
//         outstream += tmp;
        
        forceIndex += roundNatoms;
    }
    
    bornRadii2Force  = obcChain*bornRadii*bornRadii*outstream.w;
}

// The inner loop by definition creates divergent paths.  Chances are
// fair that we will take all sides of the branch anyway, so this
// verion uses manual predication
kernel void kPostObcLoop1_nobranch( 
    float repfac, 
    float atomStreamWidth, 
    float pStreamWidth,
    float natoms,
    float roundNatoms,
    float iUnroll,
    float4 pstream1[][], 
    float4 pstream2[][], 
    float4 pstream3[][], 
    float4 pstream4[][], 
    float obcChain<>, 
    float bornRadii<>, 
    out float4 outstream<>,
    out float bornRadii2Force<> )
{
    
    // ---------------------------------------------------------------------------------------
    
    float atomIndex, forceIndex, qIndex, qOff;
    float2 pindex;
    float i;
    float4 o1,o2,o3,o4;
    float4 tmp;
    
    // given atom index find force indices and streams
    
    pindex      = indexof( outstream );
    atomIndex   = pindex.x + pindex.y*atomStreamWidth;
    forceIndex  = atomIndex;
    
    outstream  = float4( 0.0f, 0.0f, 0.0f, 0.0f );
    
    for( i = 0.0f; i < repfac; i += 1.0f ){
        
        // qIndex            = floor( forceIndex/iUnroll );
        qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
        
        qOff              = forceIndex - iUnroll*qIndex;
        
        // pindex.y          = floor( qIndex/ pStreamWidth );
        pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
        
        // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
        pindex.x          = qIndex - pindex.y*pStreamWidth;

        o1 = pstream1[ pindex ];
        o2 = pstream2[ pindex ];
        o3 = pstream3[ pindex ];
        o4 = pstream4[ pindex ];

        tmp = qOff < 0.5f ? o1 : o2;
        tmp = qOff < 1.5f ? tmp : o3;
        tmp = qOff < 2.5f ? tmp : o4;
        
        outstream += tmp;
        
        forceIndex += roundNatoms;
    }
    
    bornRadii2Force  = obcChain*bornRadii*bornRadii*outstream.w;
}


kernel void kSetValue4( float value, out float4 outstream<> ){
 outstream = float4( value, value, value, value );
}

kernel void kSetValue3( float value, out float3 outstream<> ){
 outstream = float3( value, value, value );
}

kernel void kSetValue2( float value, out float2 outstream<> ){
 outstream = float2( value, value );
}

kernel void kSetValue1( float value, out float outstream<> ){
 outstream = value;
}

kernel void kCheck( float natoms, float atomStrWidth, float pstreamStrWidth, float unroll, out float4 outstream<> )
{

 float linind, forceIndex, atomIndex;
 float2 pindex;

 pindex     = indexof( outstream );
 forceIndex = unroll*(pindex.x + pindex.y*pstreamStrWidth);
 atomIndex  = fmod( forceIndex, natoms );
 outstream  = float4( pindex.x, pindex.y, forceIndex, atomIndex );
   
}

/* After forces above, we have the forces for even numbered particles
 * in one stream, odd numbered particles in another.
 * In each stream, the forces are in several parts depending on how
 * many times we replicated the input stream.
 *
 * To avoid an extra kernel to zero forces, this sets the forces
 * rather than adding to it.
 * */

kernel void kAddAndMergeFloat4( 
      float repfac, 
      float atomStrWidth, 
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 inStream<>, 
      float4 pstream1[][], 
      float4 pstream2[][], 
      out float4 outstream<> )
{
   float linind;
   float2 pindex;
   float odd;
   float i;
   float floor_linind_iUnroll;
   
   linind = (indexof outstream).x + (indexof outstream).y * atomStrWidth;
   
   //If odd or even, we pick from diferent streams.
   //odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   //linind = floor( linind / iUnroll );
   
   floor_linind_iUnroll = round( (linind - fmod(linind, iUnroll))/iUnroll );
   odd = linind - floor_linind_iUnroll * iUnroll;//bixia modify
   linind = floor_linind_iUnroll; //bixia modify

   outstream   = inStream;
   outstream.w = 0.0f;
   
   //If we have predicated conditionals, we should 
   //keep the conditional inside the loop

   for ( i = 0.0f; i < repfac; i+= 1.0f ) {

      //pindex.y = floor( linind / pstreamStrWidth );
      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth ); //bixia modify
      
      pindex.x = linind - pindex.y * pstreamStrWidth;
      
      if ( odd > 0.5f ) { //is odd
         outstream += pstream2[ pindex ];
      } else {
         outstream += pstream1[ pindex ];
      }
      linind += natoms/iUnroll; 
   }
}

kernel void kAddAndMergeFloat4_4( 
      float repfac, 
      float atomStreamWidth, 
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 inStream<>, 
      float4 pstream1[][], 
      float4 pstream2[][], 
      float4 pstream3[][], 
      float4 pstream4[][], 
      out float4 outstream<> ){

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   
   // given atom index find force indices and streams

   pindex      = indexof( outstream );
   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
   forceIndex  = atomIndex;

   // add current forces in inStream to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term

   outstream   = inStream;
   outstream.w = 0.0f;
//outstream   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   
   // sum over j-loop 'duplications' by gathering from pstreams

   for( i = 0.0f; i < repfac; i += 1.0f ){

      // qIndex            = floor( forceIndex/iUnroll );
      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );

      qOff              = forceIndex - iUnroll*qIndex;

      // pindex.y          = floor( qIndex/ pStreamWidth );
      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );

      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
      pindex.x          = qIndex - pindex.y*pStreamWidth;

      if( qOff < 0.5f ){ 
         outstream += pstream1[ pindex ];
      } else if( qOff < 1.5f ){
         outstream += pstream2[ pindex ];
      } else if( qOff < 2.5f ){
         outstream += pstream3[ pindex ];
      } else {
         outstream += pstream4[ pindex ];
      }
      forceIndex += roundNatoms;
   }
}

kernel void kPostObcLoop2( 
      float repfac, 
      float atomStreamWidth, 
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float conversion,
      float mergeNonObcForces,
      float4 inObcForces<>, 
      float3 nonObcForces<>, 
      float4 pstream1[][], 
      float4 pstream2[][], 
      float4 pstream3[][], 
      float4 pstream4[][], 
      float  atomicRadii<>,
      out float bornRadii<>,
      out float obcChain<>,
      out float3 outForces<> ){

   // ---------------------------------------------------------------------------------------

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;

   float2 iAtom; 
   float4 forces;
   float expPlus, expMinus;

   // ---------------------------------------------------------------------------------------

   // constants -- OBC Type II

   const float alphaObc          = 1.0f;
   const float betaObc           = 0.8f;
   const float gammaObc          = 4.85f;
   const float dielectricOffset  = 0.009f;

   // ---------------------------------------------------------------------------------------
   
   // given atom index find force indices and streams

   pindex      = indexof( outForces );
   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
   forceIndex  = atomIndex;

   // add current forces in inStream to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term

   forces   = inObcForces;
   forces.w = 0.0f;
//forces   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   
   // sum over j-loop 'duplications' by gathering from pstreams

   for( i = 0.0f; i < repfac; i += 1.0f ){

      // qIndex            = floor( forceIndex/iUnroll );
      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );

      qOff              = forceIndex - iUnroll*qIndex;

      // pindex.y          = floor( qIndex/ pStreamWidth );
      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );

      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
      pindex.x          = qIndex - pindex.y*pStreamWidth;

      if( qOff < 0.5f ){ 
         forces += pstream1[ pindex ];
      } else if( qOff < 1.5f ){
         forces += pstream2[ pindex ];
      } else if( qOff < 2.5f ){
         forces += pstream3[ pindex ];
      } else {
         forces += pstream4[ pindex ];
      }
      forceIndex += roundNatoms;
   }

   // compute Born radii and ObcChain

   atomicRadiiOffset            = atomicRadii - dielectricOffset;

   bornSum                      = forces.w;
   bornSum                     *= 0.5f*atomicRadiiOffset;

   sum2                         = bornSum*bornSum;
   sum3                         = bornSum*sum2;

   // Tanh does not exist? 
   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]

   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );

   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;
   expPlus                      = exp( tanhSum );
   expMinus                     = 1.0f/expPlus;
   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );
    
   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
 
   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
   if( atomIndex >= natoms ){
      bornRadii = 0.0f;
      obcChain  = 0.0f;
   }

   // add converted new forces to non-Obc forces

   outForces = conversion*forces.xyz;
   if( mergeNonObcForces > 0.1f ){
      outForces += nonObcForces; 
   }

}

kernel void kPostObcLoop2_nobranch( 
      float repfac, 
      float atomStreamWidth, 
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float conversion,
      float mergeNonObcForces,
      float4 inObcForces<>, 
      float3 nonObcForces<>, 
      float4 pstream1[][], 
      float4 pstream2[][], 
      float4 pstream3[][], 
      float4 pstream4[][], 
      float  atomicRadii<>,
      out float bornRadii<>,
      out float obcChain<>,
      out float3 outForces<> ){

   // ---------------------------------------------------------------------------------------

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
   float4 o1,o2,o3,o4;
   float4 tmp;

   float2 iAtom; 
   float4 forces;
   float expPlus, expMinus;

   // ---------------------------------------------------------------------------------------

   // constants -- OBC Type II

   const float alphaObc          = 1.0f;
   const float betaObc           = 0.8f;
   const float gammaObc          = 4.85f;
   const float dielectricOffset  = 0.009f;

   // ---------------------------------------------------------------------------------------
   
   // given atom index find force indices and streams

   pindex      = indexof( outForces );
   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
   forceIndex  = atomIndex;

   // add current forces in inStream to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term

   forces   = inObcForces;
   forces.w = 0.0f;
//forces   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   
   // sum over j-loop 'duplications' by gathering from pstreams

   for( i = 0.0f; i < repfac; i += 1.0f ){

      // qIndex            = floor( forceIndex/iUnroll );
      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );

      qOff              = forceIndex - iUnroll*qIndex;

      // pindex.y          = floor( qIndex/ pStreamWidth );
      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );

      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
      pindex.x          = qIndex - pindex.y*pStreamWidth;

      o1 = pstream1[ pindex ];
      o2 = pstream2[ pindex ];
      o3 = pstream3[ pindex ];
      o4 = pstream4[ pindex ];
      
      tmp = qOff < 0.5f ? o1 : o2;
      tmp = qOff < 1.5f ? tmp : o3;
      tmp = qOff < 2.5f ? tmp : o4;
      
      forces += tmp;
        
      forceIndex += roundNatoms;
      
   }

   // compute Born radii and ObcChain

   atomicRadiiOffset            = atomicRadii - dielectricOffset;

   bornSum                      = forces.w;
   bornSum                     *= 0.5f*atomicRadiiOffset;

   sum2                         = bornSum*bornSum;
   sum3                         = bornSum*sum2;

   // Tanh does not exist? 
   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]

   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );

   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;
   expPlus                      = exp( tanhSum );
   expMinus                     = 1.0f/expPlus;
   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );
    
   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
 
   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
   if( atomIndex >= natoms ){
      bornRadii = 0.0f;
      obcChain  = 0.0f;
   }

   // add converted new forces to non-Obc forces

   outForces = conversion*forces.xyz;
   if( mergeNonObcForces > 0.1f ){
      outForces += nonObcForces; 
   }

}

kernel void kPostCalculateBornRadii_nobranch( 
      float repfac, 
      float atomStreamWidth, 
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float conversion,
      float mergeNonObcForces,
      float4 pstream1[][], 
      float  atomicRadii<>,
      out float bornRadii<>,
      out float obcChain<> ){

   // ---------------------------------------------------------------------------------------

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
   float4 o1;
   float tmp;

   float2 iAtom; 
   float4 forces;
   float expPlus, expMinus;

   // ---------------------------------------------------------------------------------------

   // constants -- OBC Type II

   const float alphaObc          = 1.0f;
   const float betaObc           = 0.8f;
   const float gammaObc          = 4.85f;
   const float dielectricOffset  = 0.009f;

   // ---------------------------------------------------------------------------------------
   
   // given atom index find force indices and streams

   pindex      = indexof( obcChain );
   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
   forceIndex  = atomIndex;

   // add current forces in inStream to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term

   bornSum  = 0.0f;
   
   // sum over j-loop 'duplications' by gathering from pstreams

   for( i = 0.0f; i < repfac; i += 1.0f ){

      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
      qOff              = forceIndex - iUnroll*qIndex;
      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
      pindex.x          = qIndex - pindex.y*pStreamWidth;

      o1                = pstream1[ pindex ];
      
      tmp               = qOff < 0.5f ? o1.x : o1.y;
      tmp               = qOff < 1.5f ? tmp : o1.z;
      tmp               = qOff < 2.5f ? tmp : o1.w;
      
      bornSum          += tmp;
        
      forceIndex       += roundNatoms;
      
   }

   // compute Born radii and ObcChain

   atomicRadiiOffset            = atomicRadii - dielectricOffset;

   bornSum                     *= 0.5f*atomicRadiiOffset;

   sum2                         = bornSum*bornSum;
   sum3                         = bornSum*sum2;

   // Tanh does not exist? 
   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]

   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );

   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;
   expPlus                      = exp( tanhSum );
   expMinus                     = 1.0f/expPlus;
   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );
    
   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
 
   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
   if( atomIndex >= natoms ){
      bornRadii = 0.0f;
      obcChain  = 0.0f;
   }

}

/* Add forces from two streams */ 

kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
   outForce.xyz  = force1 + conversion*force2.xyz;
}

/* Copy one stream to another */

kernel void kCopyFloat4( float4 inForce<>, out float4 outForce<> ){
   outForce = inForce;
}

/* Copy one stream to another
 * */

kernel void kCopyFloat3To4( 
      float3 inForce<>, 
      out float4 outForce<> ){
   // ---------------------------------------------------------------------------------------

   outForce.xyz = inForce;
   outForce.w   = 0.0f;
}