Mods

8c3878b2 · Mark Friedrichs · 116a99ba · 8c3878b2 · 8c3878b2 · 116a99ba
Commit 8c3878b2 authored Jan 02, 2009 by Mark Friedrichs
4 changed files
--- a/platforms/brook/src/gpu/kObcBaseD2.br
+++ b/platforms/brook/src/gpu/kObcBaseD2.br
--- a/platforms/brook/src/gpu/kgbsa.h
+++ b/platforms/brook/src/gpu/kgbsa.h
@@ -486,7 +486,6 @@ void kObcLoop2(
      float streamWidth, 
      float fstreamWidth,
      ::brook::stream posq, 
-      ::brook::stream atomicRadii, 
      ::brook::stream scaledAtomicRadii, 
      ::brook::stream bornForceFactor, 
      ::brook::stream bornForce1,

--- a/platforms/brook/src/gpu/kgbsa1.br
+++ b/platforms/brook/src/gpu/kgbsa1.br
-/****************************************************************
-* This file is part of the gpu acceleration library for gromacs.
-* Author: Mark Friedrichs
-* 
-* This kernel was developed in collaboration with
-* 
-* Copyright (C) Pande Group, Stanford, 2006
-*****************************************************************/
-//96 : 228 Flops
-kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
-                           float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
-                           out float4 dGpol_dalpha2_ij<> ){
-   // ---------------------------------------------------------------------------------------
-   float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol;
-   // ---------------------------------------------------------------------------------------
-   r2                 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20
-   alpha2_ij          = jBornR*iBornR; //4
-   D_ij               = r2/(4.0f*alpha2_ij); //8
-   expTerm            = exp( -D_ij ); //4 : 80
-   denominator2       = r2 + alpha2_ij*expTerm; //8 
-   denominator        = sqrt( denominator2 ); //4 : 60
-   Gpol               = jQ/denominator; //4
-   Gpol              *= iQ;//4
-   dGpol_dr           = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2;  //16
-   dGpol_dalpha2_ij   = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; //24
-}
-/* ---------------------------------------------------------------------------------------
-   Calculate nonpolar ACE term (Simbios) 
-   bornRadius:          Born radius
-   vdwRadius:           Vdw radius
-   duplicationFactor:   duplication factor
-   aceForce:            ACE term
-   --------------------------------------------------------------------------------------- */
-kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
-                                out float aceForce<> ){
-   // ---------------------------------------------------------------------------------------
-   // nonpolar term
-   float iSurface;
-   float iAceTerm;
-   // ---------------------------------------------------------------------------------------
-   // constants
-   // solvent radius
-   const float probeRadius       = 0.14f;
-   // PI*24*0.0f049 (0.0f054=asolv from Tinker)
-   //const float PI_24_aI          = -0.3694512961;
-   const float PI_24_aI          = -400.71504079f;
-   // ---------------------------------------------------------------------------------------
-   // etch i position and partial charge
-   // e = ai * term * (ri+probe)**2 * (ri/rb)**6
-   // (drbi) = drb(i) - 6.0fd0*e/rb
-   // (rI+probe)**2
-   iSurface                     = (iVdwRadius+probeRadius);
-   iSurface                     = iSurface*iSurface;
-   // (rI/rB)**6
-   iAceTerm                     = iVdwRadius/iBornRadius;
-   iAceTerm                     = iAceTerm*iAceTerm*iAceTerm;
-   iAceTerm                     = iAceTerm*iAceTerm;
-   aceForce                     = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius);
-}
-/* ---------------------------------------------------------------------------------------
-   Calculate first loop force terms  (Simbios) 
-   numberOfAtoms:       no. of atoms
-   roundedUpAtoms:      rounded up number of atoms -- accounts for unrolling
-   duplicationFactor:   number of threads for inner loop
-   streamWidth:         atom stream width
-   fstreamWidth:        force stream width (output -- i-unroll)
-   soluteDielectric:    solute dielectric
-   solventDielectric:   solvent dielectric
-   includeAce:          include ACE term 
-   posq:                atom positions and charge
-   bornRadii:           Born radii
-   nonpolarForce:       nonpolar force (0 if nonpolar not included, else
-                        ACE value)
-   bornForce1:          i-unroll first force component, including dBornR/dr in .w
-   bornForce2:          i-unroll second force component, including dBornR/dr in .w 
-   bornForce3:          i-unroll first force component, including dBornR/dr in .w
-   bornForce4:          i-unroll second force component, including dBornR/dr in .w 
-   --------------------------------------------------------------------------------------- */
-//FLOPS = 544
-kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, 
-                       float streamWidth, float fstreamWidth, float soluteDielectric,
-                       float solventDielectric, float includeAce,
-                       float4 posq[][], float  bornRadii[][], float  atomicRadii[][], 
-                       out float4 bornForce1<>, out float4 bornForce2<>,
-                       out float4 bornForce3<>, out float4 bornForce4<> ){
-   // ---------------------------------------------------------------------------------------
-   // Born radii
-   float i1BornR, i2BornR, i3BornR, i4BornR;
-   float j1BornR, j2BornR, j3BornR, j4BornR;
-   float4 jBornR;
-   // atomic radii
-   float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
-   // i,j coordinates
-   float3 i1Pos, i2Pos, i3Pos, i4Pos; 
-   float3 j1Pos, j2Pos, j3Pos, j4Pos;
-   float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;
-   // i, j partial charges
-   float i1Q, i2Q, i3Q, i4Q;
-   float j1Q, j2Q, j3Q, j4Q;
-   float4 jQ;
-   float aceForce;
-   // delta coordinates
-   float3 d1, d2, d3, d4;
-   // intermediate terms
-   float4 dGpol_dr, dGpol_dalpha2_ij;
-   // indices
-   float2 iAtom; 
-   float forceIndex;
-   // This is forceIndex mod numberOfAtoms, the true i index
-   float iAtomLinearIndex, jLinind; 
-   float2 jAtom;
-   float jEnd, jStart, jBlock;
-   float whichRep;
-   float tmp; 
-   // ---------------------------------------------------------------------------------------
-   // electricConstant           = -166.0f2691;
-   // preFactor                  = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric))
-   float preFactor               = -332.05382f;
-   const float I_Unroll          = 4.0f;
-   const float3 zero3            = float3( 0.0f, 0.0f, 0.0f );
-   // ---------------------------------------------------------------------------------------
-   preFactor                    *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
-   iAtom                         = indexof( bornForce1 );
-   forceIndex                    = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
-   iAtomLinearIndex              = fmod( forceIndex, roundedUpAtoms );
-   // ---------------------------------------------------------------------------------------
-   // set gather index
-   iAtom.x                       = fmod(  iAtomLinearIndex, streamWidth );
-   iAtom.y                       = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );
-   // ---------------------------------------------------------------------------------------
-   // etch i1 position and partial charge
-   jQ                            = posq[          iAtom ];
-   i1BornR                       = bornRadii[     iAtom ];
-   i1AtomicR                     = atomicRadii[   iAtom ];
-   i1Pos                         = jQ.xyz;
-   i1Q                           = jQ.w;
-   i1Q                          *= preFactor;
-   kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
-   bornForce1.xyz                = zero3;
-   bornForce1.w                  = includeAce > 0.5f ? aceForce : 0.0f;
-   // ---------------------------------------------------------------------------------------
-   // etch i2 position and partial charge
-   iAtom.x                      += 1;
-   jQ                            = posq[          iAtom ];
-   i2BornR                       = bornRadii[     iAtom ];
-   i2AtomicR                     = atomicRadii[   iAtom ];
-   i2Pos                         = jQ.xyz;
-   i2Q                           = jQ.w;
-   i2Q                          *= preFactor;
-   kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
-   bornForce2.xyz                = zero3;
-   bornForce2.w                  = includeAce > 0.5f ? aceForce : 0.0f;
-   // ---------------------------------------------------------------------------------------
-   // etch i3 position and partial charge
-   iAtom.x                      += 1;
-   jQ                            = posq[          iAtom ];
-   i3BornR                       = bornRadii[     iAtom ];
-   i3AtomicR                     = atomicRadii[   iAtom ];
-   i3Pos                         = jQ.xyz;
-   i3Q                           = jQ.w;
-   i3Q                          *= preFactor;
-   kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
-   bornForce3.xyz                = zero3;
-   bornForce3.w                  = includeAce > 0.5f ? aceForce : 0.0f;
-   // ---------------------------------------------------------------------------------------
-   // etch i4 position and partial charge
-   iAtom.x                      += 1;
-   jQ                            = posq[          iAtom ];
-   i4BornR                       = bornRadii[     iAtom ];
-   i4AtomicR                     = atomicRadii[   iAtom ];
-   i4Pos                         = jQ.xyz;
-   i4Q                           = jQ.w;
-   i4Q                          *= preFactor;
-   kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
-   bornForce4.xyz                = zero3;
-   bornForce4.w                  = includeAce > 0.5f ? aceForce : 0.0f;
-   // ---------------------------------------------------------------------------------------
-   // inner loop setup
-   // if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
-   // to increase number of threads in flight
-   // forceStreamSz = N*RepFac/I_UnRoll
-   // forceIndex       = I_UnRoll*( a.x + a.y*forceStreamSz )
-   // whichRep      = 0 or 1
-   // jBlock        = 1 + floor[ N/(duplicationFactor*streamWidth) ]
-   //changed the following instruction for rounding issues on some ASICs
-   //whichRep                     = floor( forceIndex / roundedUpAtoms );
-   tmp = fmod(forceIndex, roundedUpAtoms);
-   whichRep = round((forceIndex - tmp)/roundedUpAtoms);
-   //jBlock                        = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
-   tmp = fmod( numberOfAtoms, (duplicationFactor*streamWidth ) );   
-   jBlock                       = 1 + round( (numberOfAtoms-tmp)/(duplicationFactor*streamWidth ));
-   jStart                       = whichRep*jBlock;
-   jEnd                         = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
-   jAtom.y                      = jStart;
-   jLinind                      = jAtom.y*streamWidth;
-   // ---------------------------------------------------------------------------------------
-   while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind )  > 0.9f ){
-      jAtom.x = 0.0f;
-      while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
-         // ---------------------------------------------------------------------------------------
-         // gather required values
-         j1PosQ             = posq[      jAtom ];
-         j1Pos              = j1PosQ.xyz;
-         j1Q                = j1PosQ.w;
-         j1BornR            = bornRadii[ jAtom ];
-         jAtom.x           += 1.0f;
-         j2PosQ             = posq[      jAtom ];
-         j2Pos              = j2PosQ.xyz;
-         j2Q                = j2PosQ.w;
-         j2BornR            = bornRadii[ jAtom ];
-         jAtom.x           += 1.0f;
-         j3PosQ             = posq[      jAtom ];
-         j3Pos              = j3PosQ.xyz;
-         j3Q                = j3PosQ.w;
-         j3BornR            = bornRadii[ jAtom ];
-         jAtom.x           += 1.0f;
-         j4PosQ             = posq[      jAtom ];
-         j4Pos              = j4PosQ.xyz;
-         j4Q                = j4PosQ.w;
-         j4BornR            = bornRadii[ jAtom ];
-         jAtom.x           += 1.0f;
-         jBornR             = float4( j1BornR, j2BornR, j3BornR, j4BornR );
-         jQ                 = float4( j1Q, j2Q, j3Q, j4Q );
-         // ---------------------------------------------------------------------------------------
-         // i == 1
-         d1                 = i1Pos - j1Pos; //3
-         d2                 = i1Pos - j2Pos; //3
-         d3                 = i1Pos - j3Pos; //3
-         d4                 = i1Pos - j4Pos; //3
-         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
-         bornForce1.xyz    += dGpol_dr.x*d1; //6
-         bornForce1.xyz    += dGpol_dr.y*d2; //6
-         bornForce1.xyz    += dGpol_dr.z*d3; //6
-         bornForce1.xyz    += dGpol_dr.w*d4; //6
-         bornForce1.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
-         // ---------------------------------------------------------------------------------------
-         // i == 2
-         d1                 = i2Pos - j1Pos; //3
-         d2                 = i2Pos - j2Pos; //3
-         d3                 = i2Pos - j3Pos; //3
-         d4                 = i2Pos - j4Pos; //3
-         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
-         bornForce2.xyz    += dGpol_dr.x*d1; //6
-         bornForce2.xyz    += dGpol_dr.y*d2; //6
-         bornForce2.xyz    += dGpol_dr.z*d3; //6
-         bornForce2.xyz    += dGpol_dr.w*d4; //6
-         bornForce2.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
-         // ---------------------------------------------------------------------------------------
-         // i == 3
-         d1                 = i3Pos - j1Pos; //3
-         d2                 = i3Pos - j2Pos; //3
-         d3                 = i3Pos - j3Pos; //3
-         d4                 = i3Pos - j4Pos; //3
-         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
-         bornForce3.xyz    += dGpol_dr.x*d1; //6
-         bornForce3.xyz    += dGpol_dr.y*d2; //6
-         bornForce3.xyz    += dGpol_dr.z*d3; //6
-         bornForce3.xyz    += dGpol_dr.w*d4; //6 
-         bornForce3.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
-         // ---------------------------------------------------------------------------------------
-         // i == 4
-         d1                 = i4Pos - j1Pos; //3
-         d2                 = i4Pos - j2Pos; //3
-         d3                 = i4Pos - j3Pos; //3
-         d4                 = i4Pos - j4Pos; //3
-         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
-         bornForce4.xyz    += dGpol_dr.x*d1; //6 
-         bornForce4.xyz    += dGpol_dr.y*d2; //6 
-         bornForce4.xyz    += dGpol_dr.z*d3; //6 
-         bornForce4.xyz    += dGpol_dr.w*d4; //6 
-         bornForce4.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
-         // ---------------------------------------------------------------------------------------
-         jLinind    += 4.0f;
-      }
-      jAtom.y       += 1.0f;      
-   }
-}
--- a/platforms/brook/src/gpu/kmerge.br
+++ b/platforms/brook/src/gpu/kmerge.br
@@ -225,154 +225,6 @@ kernel void kMergeFloat4_4(
   }
 }
-kernel void kPostObcLoop1( 
-    float repfac, 
-    float atomStreamWidth, 
-    float pStreamWidth,
-    float natoms,
-    float roundNatoms,
-    float iUnroll,
-    float4 pstream1[][], 
-    float4 pstream2[][], 
-    float4 pstream3[][], 
-    float4 pstream4[][], 
-    float obcChain<>, 
-    float bornRadii<>, 
-    out float4 outstream<>,
-    out float bornRadii2Force<> )
-{
-    // ---------------------------------------------------------------------------------------
-    float atomIndex, forceIndex, qIndex, qOff;
-    float2 pindex;
-    float i;
-    float4 o1,o2,o3,o4;
-    float4 tmp;
-    float4 zero4;
-    // given atom index find force indices and streams
-    pindex      = indexof( outstream );
-    atomIndex   = pindex.x + pindex.y*atomStreamWidth;
-    forceIndex  = atomIndex;
-    zero4   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
-    outstream = zero4;
-    for( i = 0.0f; i < repfac; i += 1.0f ){
-        // qIndex            = floor( forceIndex/iUnroll );
-        qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
-        qOff              = forceIndex - iUnroll*qIndex;
-        // pindex.y          = floor( qIndex/ pStreamWidth );
-        pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
-        // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
-        pindex.x          = qIndex - pindex.y*pStreamWidth;
-// outstream += float4( forceIndex, qIndex, pindex.x,  pindex.y );
-        // This is going to cause really divergent code and we are
-        // going to end up doing all the fetches anyway...
-        if ( qOff < 0.5f ){ 
-            outstream += pstream1[ pindex ];
-        } else if( qOff < 1.5f ){
-            outstream += pstream2[ pindex ];
-        } else if( qOff < 2.5f ){
-            outstream += pstream3[ pindex ];
-        } else {
-            outstream += pstream4[ pindex ];
-        }
-//         o1 = pstream1[ pindex ];
-//         o2 = pstream2[ pindex ];
-//         o3 = pstream3[ pindex ];
-//         o4 = pstream4[ pindex ];
-//         tmp = qOff < 0.5f ? o1 : o2;
-//         tmp = qOff < 1.5f ? tmp : o3;
-//         tmp = qOff < 2.5f ? tmp : o4;
-//         outstream += tmp;
-        forceIndex += roundNatoms;
-    }
-    bornRadii2Force  = obcChain*bornRadii*bornRadii*outstream.w;
-}
-// The inner loop by definition creates divergent paths.  Chances are
-// fair that we will take all sides of the branch anyway, so this
-// verion uses manual predication
-kernel void kPostObcLoop1_nobranch( 
-    float repfac, 
-    float atomStreamWidth, 
-    float pStreamWidth,
-    float natoms,
-    float roundNatoms,
-    float iUnroll,
-    float4 pstream1[][], 
-    float4 pstream2[][], 
-    float4 pstream3[][], 
-    float4 pstream4[][], 
-    float obcChain<>, 
-    float bornRadii<>, 
-    out float4 outstream<>,
-    out float bornRadii2Force<> )
-{
-    // ---------------------------------------------------------------------------------------
-    float atomIndex, forceIndex, qIndex, qOff;
-    float2 pindex;
-    float i;
-    float4 o1,o2,o3,o4;
-    float4 tmp;
-    // given atom index find force indices and streams
-    pindex      = indexof( outstream );
-    atomIndex   = pindex.x + pindex.y*atomStreamWidth;
-    forceIndex  = atomIndex;
-    outstream  = float4( 0.0f, 0.0f, 0.0f, 0.0f );
-    for( i = 0.0f; i < repfac; i += 1.0f ){
-        // qIndex            = floor( forceIndex/iUnroll );
-        qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
-        qOff              = forceIndex - iUnroll*qIndex;
-        // pindex.y          = floor( qIndex/ pStreamWidth );
-        pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
-        // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
-        pindex.x          = qIndex - pindex.y*pStreamWidth;
-        o1 = pstream1[ pindex ];
-        o2 = pstream2[ pindex ];
-        o3 = pstream3[ pindex ];
-        o4 = pstream4[ pindex ];
-        tmp = qOff < 0.5f ? o1 : o2;
-        tmp = qOff < 1.5f ? tmp : o3;
-        tmp = qOff < 2.5f ? tmp : o4;
-        outstream += tmp;
-        forceIndex += roundNatoms;
-    }
-    bornRadii2Force  = obcChain*bornRadii*bornRadii*outstream.w;
-}
 kernel void kSetValue4( float value, out float4 outstream<> ){
 outstream = float4( value, value, value, value );
 }
@@ -522,354 +374,6 @@ kernel void kAddAndMergeFloat4_4(
   }
 }
-kernel void kPostObcLoop2( 
-      float repfac, 
-      float atomStreamWidth, 
-      float pStreamWidth,
-      float natoms,
-      float roundNatoms,
-      float iUnroll,
-      float conversion,
-      float mergeNonObcForces,
-      float4 inObcForces<>, 
-      float3 nonObcForces<>, 
-      float4 pstream1[][], 
-      float4 pstream2[][], 
-      float4 pstream3[][], 
-      float4 pstream4[][], 
-      float  atomicRadii<>,
-      out float bornRadii<>,
-      out float obcChain<>,
-      out float3 outForces<> ){
-   // ---------------------------------------------------------------------------------------
-   float atomIndex, forceIndex, qIndex, qOff;
-   float2 pindex;
-   float i;
-   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
-   float2 iAtom; 
-   float4 forces;
-   float expPlus, expMinus;
-   // ---------------------------------------------------------------------------------------
-   // constants -- OBC Type II
-   const float alphaObc          = 1.0f;
-   const float betaObc           = 0.8f;
-   const float gammaObc          = 4.85f;
-   const float dielectricOffset  = 0.009f;
-   // ---------------------------------------------------------------------------------------
-   // given atom index find force indices and streams
-   pindex      = indexof( outForces );
-   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
-   forceIndex  = atomIndex;
-   // add current forces in inStream to forces stored in pstreams
-   // the .w entry is Born sum values; it will be used to calculate the
-   // Born radii and obcChain term
-   forces   = inObcForces;
-   forces.w = 0.0f;
-//forces   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
-   // sum over j-loop 'duplications' by gathering from pstreams
-   for( i = 0.0f; i < repfac; i += 1.0f ){
-      // qIndex            = floor( forceIndex/iUnroll );
-      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
-      qOff              = forceIndex - iUnroll*qIndex;
-      // pindex.y          = floor( qIndex/ pStreamWidth );
-      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
-      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
-      pindex.x          = qIndex - pindex.y*pStreamWidth;
-      if( qOff < 0.5f ){ 
-         forces += pstream1[ pindex ];
-      } else if( qOff < 1.5f ){
-         forces += pstream2[ pindex ];
-      } else if( qOff < 2.5f ){
-         forces += pstream3[ pindex ];
-      } else {
-         forces += pstream4[ pindex ];
-      }
-      forceIndex += roundNatoms;
-   }
-   // compute Born radii and ObcChain
-   atomicRadiiOffset            = atomicRadii - dielectricOffset;
-   bornSum                      = forces.w;
-   bornSum                     *= 0.5f*atomicRadiiOffset;
-   sum2                         = bornSum*bornSum;
-   sum3                         = bornSum*sum2;
-   // Tanh does not exist? 
-   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
-   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
-   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;
-   expPlus                      = exp( tanhSum );
-   expMinus                     = 1.0f/expPlus;
-   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );
-   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
-   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
-   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
-   if( atomIndex >= natoms ){
-      bornRadii = 0.0f;
-      obcChain  = 0.0f;
-   }
-   // add converted new forces to non-Obc forces
-   outForces = conversion*forces.xyz;
-   if( mergeNonObcForces > 0.1f ){
-      outForces += nonObcForces; 
-   }
-}
-kernel void kPostObcLoop2_nobranch( 
-      float repfac, 
-      float atomStreamWidth, 
-      float pStreamWidth,
-      float natoms,
-      float roundNatoms,
-      float iUnroll,
-      float conversion,
-      float mergeNonObcForces,
-      float4 inObcForces<>, 
-      float3 nonObcForces<>, 
-      float4 pstream1[][], 
-      float4 pstream2[][], 
-      float4 pstream3[][], 
-      float4 pstream4[][], 
-      float  atomicRadii<>,
-      out float bornRadii<>,
-      out float obcChain<>,
-      out float3 outForces<> ){
-   // ---------------------------------------------------------------------------------------
-   float atomIndex, forceIndex, qIndex, qOff;
-   float2 pindex;
-   float i;
-   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
-   float4 o1,o2,o3,o4;
-   float4 tmp;
-   float2 iAtom; 
-   float4 forces;
-   float expPlus, expMinus;
-   // ---------------------------------------------------------------------------------------
-   // constants -- OBC Type II
-   const float alphaObc          = 1.0f;
-   const float betaObc           = 0.8f;
-   const float gammaObc          = 4.85f;
-   const float dielectricOffset  = 0.009f;
-   // ---------------------------------------------------------------------------------------
-   // given atom index find force indices and streams
-   pindex      = indexof( outForces );
-   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
-   forceIndex  = atomIndex;
-   // add current forces in inStream to forces stored in pstreams
-   // the .w entry is Born sum values; it will be used to calculate the
-   // Born radii and obcChain term
-   forces   = inObcForces;
-   forces.w = 0.0f;
-//forces   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
-   // sum over j-loop 'duplications' by gathering from pstreams
-   for( i = 0.0f; i < repfac; i += 1.0f ){
-      // qIndex            = floor( forceIndex/iUnroll );
-      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
-      qOff              = forceIndex - iUnroll*qIndex;
-      // pindex.y          = floor( qIndex/ pStreamWidth );
-      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
-      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
-      pindex.x          = qIndex - pindex.y*pStreamWidth;
-      o1 = pstream1[ pindex ];
-      o2 = pstream2[ pindex ];
-      o3 = pstream3[ pindex ];
-      o4 = pstream4[ pindex ];
-      tmp = qOff < 0.5f ? o1 : o2;
-      tmp = qOff < 1.5f ? tmp : o3;
-      tmp = qOff < 2.5f ? tmp : o4;
-      forces += tmp;
-      forceIndex += roundNatoms;
-   }
-   // compute Born radii and ObcChain
-   atomicRadiiOffset            = atomicRadii - dielectricOffset;
-   bornSum                      = forces.w;
-   bornSum                     *= 0.5f*atomicRadiiOffset;
-   sum2                         = bornSum*bornSum;
-   sum3                         = bornSum*sum2;
-   // Tanh does not exist? 
-   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
-   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
-   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;
-   expPlus                      = exp( tanhSum );
-   expMinus                     = 1.0f/expPlus;
-   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );
-   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
-   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
-   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
-   if( atomIndex >= natoms ){
-      bornRadii = 0.0f;
-      obcChain  = 0.0f;
-   }
-   // add converted new forces to non-Obc forces
-   outForces = conversion*forces.xyz;
-   if( mergeNonObcForces > 0.1f ){
-      outForces += nonObcForces; 
-   }
-}
-kernel void kPostCalculateBornRadii_nobranch( 
-      float repfac, 
-      float atomStreamWidth, 
-      float pStreamWidth,
-      float natoms,
-      float roundNatoms,
-      float iUnroll,
-      float conversion,
-      float mergeNonObcForces,
-      float4 pstream1[][], 
-      float  atomicRadii<>,
-      out float bornRadii<>,
-      out float obcChain<> ){
-   // ---------------------------------------------------------------------------------------
-   float atomIndex, forceIndex, qIndex, qOff;
-   float2 pindex;
-   float i;
-   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
-   float4 o1;
-   float tmp;
-   float2 iAtom; 
-   float4 forces;
-   float expPlus, expMinus;
-   // ---------------------------------------------------------------------------------------
-   // constants -- OBC Type II
-   const float alphaObc          = 1.0f;
-   const float betaObc           = 0.8f;
-   const float gammaObc          = 4.85f;
-   const float dielectricOffset  = 0.009f;
-   // ---------------------------------------------------------------------------------------
-   // given atom index find force indices and streams
-   pindex      = indexof( obcChain );
-   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
-   forceIndex  = atomIndex;
-   // add current forces in inStream to forces stored in pstreams
-   // the .w entry is Born sum values; it will be used to calculate the
-   // Born radii and obcChain term
-   bornSum  = 0.0f;
-   // sum over j-loop 'duplications' by gathering from pstreams
-   for( i = 0.0f; i < repfac; i += 1.0f ){
-      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
-      qOff              = forceIndex - iUnroll*qIndex;
-      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
-      pindex.x          = qIndex - pindex.y*pStreamWidth;
-      o1                = pstream1[ pindex ];
-      tmp               = qOff < 0.5f ? o1.x : o1.y;
-      tmp               = qOff < 1.5f ? tmp : o1.z;
-      tmp               = qOff < 2.5f ? tmp : o1.w;
-      bornSum          += tmp;
-      forceIndex       += roundNatoms;
-   }
-   // compute Born radii and ObcChain
-   atomicRadiiOffset            = atomicRadii - dielectricOffset;
-   bornSum                     *= 0.5f*atomicRadiiOffset;
-   sum2                         = bornSum*bornSum;
-   sum3                         = bornSum*sum2;
-   // Tanh does not exist? 
-   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
-   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
-   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;
-   expPlus                      = exp( tanhSum );
-   expMinus                     = 1.0f/expPlus;
-   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );
-   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
-   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
-   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
-   if( atomIndex >= natoms ){
-      bornRadii = 0.0f;
-      obcChain  = 0.0f;
-   }
-}
 /* Add forces from two streams */ 
 kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){