Mods

cb130f92 · Mark Friedrichs · cc8b4de0 · cb130f92 · cb130f92 · cb130f92
Commit cb130f92 authored Sep 25, 2008 by Mark Friedrichs
8 changed files
--- a/platforms/brook/src/gpu/kgbsa1.br
+++ b/platforms/brook/src/gpu/kgbsa1.br
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: Mark Friedrichs
+* 
+* This kernel was developed in collaboration with
+* 
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+// Pair term shared by the unrolled OBC loop: evaluates four j atoms at once (one per
+// float4 lane) against a single i atom.
+//   d1..d4:            separation vectors between the i atom and each of the four j atoms
+//   jBornR, jQ:        Born radii and charges of the four j atoms
+//   iBornR, iQ:        Born radius and (already prefactor-scaled, by the caller) charge of i
+//   dGpol_dr:          per-lane factor the caller multiplies by the separation vector
+//   dGpol_dalpha2_ij:  per-lane term the caller sums into the .w component
+kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
+                           float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
+                           out float4 dGpol_dalpha2_ij<> ){
+   float4 distSq, radiiProduct, scaledDistSq, decay, fSq, f, pairTerm;
+   // squared separation for each of the four j atoms
+   distSq             = float4( dot( d1, d1 ), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
+   // product of the i and j Born radii
+   radiiProduct       = jBornR*iBornR;
+   // r^2/(4*Ri*Rj) and its exponential damping factor
+   scaledDistSq       = distSq/(4.0f*radiiProduct);
+   decay              = exp( -scaledDistSq );
+   // f^2 = r^2 + Ri*Rj*exp( -r^2/(4*Ri*Rj) ), and f itself
+   fSq                = distSq + radiiProduct*decay;
+   f                  = sqrt( fSq );
+   // qi*qj/f
+   pairTerm           = jQ/f;
+   pairTerm          *= iQ;
+   // derivative factors returned to the caller
+   dGpol_dr           = -pairTerm*( 1.0f - 0.25f*decay )/fSq;
+   dGpol_dalpha2_ij   = -0.5f*pairTerm*decay*( 1.0f + scaledDistSq )*jBornR/fSq;
+}
+/* ---------------------------------------------------------------------------------------
+   Calculate nonpolar ACE term (Simbios)
+   Evaluates
+      aceForce = PI_24_aI * (vdwRadius + probeRadius)^2 * (vdwRadius/bornRadius)^6
+                 / (duplicationFactor * bornRadius)
+   iBornRadius:         Born radius of the atom
+   iVdwRadius:          Vdw radius of the atom
+   duplicationFactor:   duplication factor; the result is divided by it
+   aceForce:            ACE term (output)
+   --------------------------------------------------------------------------------------- */
+kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
+                                out float aceForce<> ){
+   float expandedRadius, surfaceTerm;
+   float radiusRatio, ratioCubed, ratioSixth;
+   // solvent (probe) radius
+   const float probeRadius       = 0.14f;
+   // -(PI*4*6*0.0054*1000), where 0.0054 is asolv from Tinker
+   // (an earlier value, -0.3694512961, is no longer used)
+   const float PI_24_aI          = -407.1504079f;
+   // (rI + probe)**2
+   expandedRadius               = iVdwRadius + probeRadius;
+   surfaceTerm                  = expandedRadius*expandedRadius;
+   // (rI/rB)**6, built as the square of the cube
+   radiusRatio                  = iVdwRadius/iBornRadius;
+   ratioCubed                   = radiusRatio*radiusRatio*radiusRatio;
+   ratioSixth                   = ratioCubed*ratioCubed;
+   // scale by the constant and divide by duplicationFactor*rB
+   aceForce                     = surfaceTerm*ratioSixth*PI_24_aI/(duplicationFactor*iBornRadius);
+}
+/* ---------------------------------------------------------------------------------------
+   Calculate first loop force terms  (Simbios)
+   Each output element covers four consecutive i atoms (I_Unroll = 4), one per
+   bornForceN stream, and sums their interactions with one block of j rows of the
+   atom stream; which block is selected by the replica index derived from the
+   output position.
+   numberOfAtoms:       no. of atoms
+   roundedUpAtoms:      rounded up number of atoms -- accounts for unrolling
+   duplicationFactor:   number of threads for inner loop
+   streamWidth:         atom stream width
+   fstreamWidth:        force stream width (output -- i-unroll)
+   soluteDielectric:    solute dielectric
+   solventDielectric:   solvent dielectric
+   includeAce:          include ACE term when > 0.5
+   posq:                atom positions (float3); charges are read from atomicRadii.y
+   bornRadii:           Born radii
+   atomicRadii:         .x = radius handed to kAceNonPolarLoop1, .y = partial charge
+   bornForce1..4:       outputs for i atoms 1..4 of the unrolled group:
+                        .xyz accumulates dGpol_dr*(ri - rj) over the j block
+                        .w   starts at the ACE term (0 when includeAce is off) and
+                             accumulates dGpol_dalpha2_ij over the j block
+   --------------------------------------------------------------------------------------- */
+kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, 
+                       float streamWidth, float fstreamWidth, float soluteDielectric,
+                       float solventDielectric, float includeAce,
+                       float3 posq[][], float  bornRadii[][], float2  atomicRadii[][], 
+                       out float4 bornForce1<>, out float4 bornForce2<>,
+                       out float4 bornForce3<>, out float4 bornForce4<> ){
+   // ---------------------------------------------------------------------------------------
+   // Born radii
+   float i1BornR, i2BornR, i3BornR, i4BornR;
+   float j1BornR, j2BornR, j3BornR, j4BornR;
+   float4 jBornR;
+   // atomic radii
+   float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
+   // i,j coordinates
+   float3 i1Pos, i2Pos, i3Pos, i4Pos;
+   float3 j1Pos, j2Pos, j3Pos, j4Pos;
+   // i, j partial charges
+   float i1Q, i2Q, i3Q, i4Q;
+   float j1Q, j2Q, j3Q, j4Q;
+   float4 jQ;
+   float aceForce;
+   // delta coordinates
+   float3 d1, d2, d3, d4;
+   // intermediate terms
+   float4 dGpol_dr, dGpol_dalpha2_ij;
+   // indices
+   float2 iAtom;
+   float forceIndex;
+   // iAtomLinearIndex is forceIndex mod roundedUpAtoms, the true i index
+   float iAtomLinearIndex, jLinind;
+   float2 jAtom;
+   float jEnd, jStart, jBlock;
+   float whichRep;
+   // ---------------------------------------------------------------------------------------
+   // electricConstant           = -166.02691
+   // preFactor                  = 2*electricConstant*( 1/soluteDielectric - 1/solventDielectric )
+   float preFactor               = -332.05382f;
+   const float I_Unroll          = 4.0f;
+   const float3 zero3            = float3( 0.0f, 0.0f, 0.0f );
+   // ---------------------------------------------------------------------------------------
+   preFactor                    *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
+   // linear position in the force stream, scaled by the i-unroll
+   iAtom                         = indexof( bornForce1 );
+   forceIndex                    = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
+   iAtomLinearIndex              = fmod( forceIndex, roundedUpAtoms );
+   // ---------------------------------------------------------------------------------------
+   // set gather index (fmod/round form is used instead of floor of a quotient)
+   iAtom.x                       = fmod(  iAtomLinearIndex, streamWidth );
+   iAtom.y                       = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
+   // ---------------------------------------------------------------------------------------
+   // fetch i1 position, partial charge and radii; posq is float3, so gather it directly
+   i1Pos                         = posq[          iAtom ];
+   i1Q                           = atomicRadii[   iAtom ].y;
+   i1Q                          *= preFactor;
+   i1BornR                       = bornRadii[     iAtom ];
+   i1AtomicR                     = atomicRadii[   iAtom ].x;
+   kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
+   bornForce1.xyz                = zero3;
+   bornForce1.w                  = includeAce > 0.5f ? aceForce : 0.0f;
+   // ---------------------------------------------------------------------------------------
+   // fetch i2 position, partial charge and radii
+   iAtom.x                      += 1.0f;
+   i2Pos                         = posq[          iAtom ];
+   i2Q                           = atomicRadii[   iAtom ].y;
+   i2Q                          *= preFactor;
+   i2BornR                       = bornRadii[     iAtom ];
+   i2AtomicR                     = atomicRadii[   iAtom ].x;
+   kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
+   bornForce2.xyz                = zero3;
+   bornForce2.w                  = includeAce > 0.5f ? aceForce : 0.0f;
+   // ---------------------------------------------------------------------------------------
+   // fetch i3 position, partial charge and radii
+   iAtom.x                      += 1.0f;
+   i3Pos                         = posq[          iAtom ];
+   i3Q                           = atomicRadii[   iAtom ].y;
+   i3Q                          *= preFactor;
+   i3BornR                       = bornRadii[     iAtom ];
+   i3AtomicR                     = atomicRadii[   iAtom ].x;
+   kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
+   bornForce3.xyz                = zero3;
+   bornForce3.w                  = includeAce > 0.5f ? aceForce : 0.0f;
+   // ---------------------------------------------------------------------------------------
+   // fetch i4 position, partial charge and radii
+   iAtom.x                      += 1.0f;
+   i4Pos                         = posq[          iAtom ];
+   i4Q                           = atomicRadii[   iAtom ].y;
+   i4Q                          *= preFactor;
+   i4BornR                       = bornRadii[     iAtom ];
+   i4AtomicR                     = atomicRadii[   iAtom ].x;
+   kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
+   bornForce4.xyz                = zero3;
+   bornForce4.w                  = includeAce > 0.5f ? aceForce : 0.0f;
+   // ---------------------------------------------------------------------------------------
+   // inner loop setup
+   // the j loop is broken into duplicationFactor segments to increase the number of
+   // threads in flight; whichRep selects the segment this output element handles
+   // forceIndex    = I_Unroll*( a.x + a.y*fstreamWidth )
+   // whichRep      = forceIndex div roundedUpAtoms
+   // jBlock        = 1 + floor[ N/(duplicationFactor*streamWidth) ]
+   // whichRep was floor( forceIndex / roundedUpAtoms ); changed for rounding issues on
+   // some ASICs. iAtomLinearIndex already holds fmod( forceIndex, roundedUpAtoms ).
+   whichRep                     = round( (forceIndex - iAtomLinearIndex)/roundedUpAtoms );
+   jBlock                       = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
+   jStart                       = whichRep*jBlock;
+   // the last segment has no row limit; it stops when the atoms are exhausted
+   jEnd                         = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
+   jAtom.y                      = jStart;
+   jLinind                      = jAtom.y*streamWidth;
+   // ---------------------------------------------------------------------------------------
+   while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind )  > 0.9f ){
+      jAtom.x = 0.0f;
+      while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
+         // ---------------------------------------------------------------------------------------
+         // gather four consecutive j atoms
+         j1Pos              = posq[        jAtom ];
+         j1Q                = atomicRadii[ jAtom ].y;
+         j1BornR            = bornRadii[   jAtom ];
+         jAtom.x           += 1.0f;
+         j2Pos              = posq[        jAtom ];
+         j2Q                = atomicRadii[ jAtom ].y;
+         j2BornR            = bornRadii[   jAtom ];
+         jAtom.x           += 1.0f;
+         j3Pos              = posq[        jAtom ];
+         j3Q                = atomicRadii[ jAtom ].y;
+         j3BornR            = bornRadii[   jAtom ];
+         jAtom.x           += 1.0f;
+         j4Pos              = posq[        jAtom ];
+         j4Q                = atomicRadii[ jAtom ].y;
+         j4BornR            = bornRadii[   jAtom ];
+         jAtom.x           += 1.0f;
+         jBornR             = float4( j1BornR, j2BornR, j3BornR, j4BornR );
+         jQ                 = float4( j1Q, j2Q, j3Q, j4Q );
+         // ---------------------------------------------------------------------------------------
+         // i == 1
+         d1                 = i1Pos - j1Pos;
+         d2                 = i1Pos - j2Pos;
+         d3                 = i1Pos - j3Pos;
+         d4                 = i1Pos - j4Pos;
+         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij );
+         bornForce1.xyz    += dGpol_dr.x*d1;
+         bornForce1.xyz    += dGpol_dr.y*d2;
+         bornForce1.xyz    += dGpol_dr.z*d3;
+         bornForce1.xyz    += dGpol_dr.w*d4;
+         bornForce1.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
+         // ---------------------------------------------------------------------------------------
+         // i == 2
+         d1                 = i2Pos - j1Pos;
+         d2                 = i2Pos - j2Pos;
+         d3                 = i2Pos - j3Pos;
+         d4                 = i2Pos - j4Pos;
+         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij );
+         bornForce2.xyz    += dGpol_dr.x*d1;
+         bornForce2.xyz    += dGpol_dr.y*d2;
+         bornForce2.xyz    += dGpol_dr.z*d3;
+         bornForce2.xyz    += dGpol_dr.w*d4;
+         bornForce2.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
+         // ---------------------------------------------------------------------------------------
+         // i == 3
+         d1                 = i3Pos - j1Pos;
+         d2                 = i3Pos - j2Pos;
+         d3                 = i3Pos - j3Pos;
+         d4                 = i3Pos - j4Pos;
+         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij );
+         bornForce3.xyz    += dGpol_dr.x*d1;
+         bornForce3.xyz    += dGpol_dr.y*d2;
+         bornForce3.xyz    += dGpol_dr.z*d3;
+         bornForce3.xyz    += dGpol_dr.w*d4;
+         bornForce3.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
+         // ---------------------------------------------------------------------------------------
+         // i == 4
+         d1                 = i4Pos - j1Pos;
+         d2                 = i4Pos - j2Pos;
+         d3                 = i4Pos - j3Pos;
+         d4                 = i4Pos - j4Pos;
+         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij );
+         bornForce4.xyz    += dGpol_dr.x*d1;
+         bornForce4.xyz    += dGpol_dr.y*d2;
+         bornForce4.xyz    += dGpol_dr.z*d3;
+         bornForce4.xyz    += dGpol_dr.w*d4;
+         bornForce4.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
+         // ---------------------------------------------------------------------------------------
+         // four more j atoms consumed
+         jLinind    += 4.0f;
+      }
+      jAtom.y       += 1.0f;
+   }
+}
--- a/platforms/brook/src/gpu/kmerge.br
+++ b/platforms/brook/src/gpu/kmerge.br
+/****************************************************************
+ //Linear index of i particle, divided by 2 because we unroll i by 2
+* This file is part of the gpu acceleration library for gromacs.
+* Author: Mark Friedrichs
+* 
+* This kernel was developed in collaboration with
+* 
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+/* Merge partial per-atom values into one float stream.
+ *
+ * After the force kernels, the values for even numbered particles are in one
+ * stream (pstream1) and odd numbered particles in another (pstream2). In each
+ * stream the values are in repfac parts, one per replication of the input
+ * stream; the parts are natoms/iUnroll elements apart.
+ *
+ * To avoid an extra kernel to zero the output, this sets outstream (it starts
+ * the sum from 0) rather than adding to it.
+ *
+ *   repfac:           number of replicated parts to sum
+ *   atomStrWidth:     width of the output (atom) stream
+ *   pstreamStrWidth:  width of the partial streams
+ *   natoms:           number of atoms; natoms/iUnroll is the stride between parts
+ *   iUnroll:          i-unroll factor used to split atoms between the partial streams
+ *   count:            iterator stream giving the 2D position of this output element
+ *   pstream1/2:       partial streams (gathered)
+ *   outstream:        merged result
+ *
+ * Integer quotients are computed as round( (x - fmod( x, w ))/w ) rather than
+ * floor( x/w ), matching the other merge kernels in this file.
+ * */
+kernel void kMergeFloat( 
+      float repfac, 
+      float atomStrWidth, 
+      float pstreamStrWidth,
+      float natoms,
+      float iUnroll,
+      iter float2 count<>, 
+      float pstream1[][], 
+      float pstream2[][], 
+      out float outstream<> )
+{
+   float linind;
+   float2 pindex;
+   float odd;
+   float i;
+   float quotient;
+   float partStride;
+   //convert to linear atom index
+   linind = count.x + count.y * atomStrWidth;
+   //remainder selects the stream: 0 -> pstream1, 1 -> pstream2
+   quotient = round( (linind - fmod( linind, iUnroll ))/iUnroll );
+   odd = linind - quotient * iUnroll;
+   //Now linear index is the index into partial_streams
+   linind = quotient;
+   //distance between successive replicated parts
+   partStride = natoms/iUnroll;
+   outstream = 0.0f;
+   //If we have predicated conditionals, we should 
+   //keep the conditional inside the loop
+   for ( i = 0.0f; i < repfac; i += 1.0f ) {
+      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth );
+      pindex.x = linind - pindex.y * pstreamStrWidth;
+      if ( odd > 0.5f ) { //is odd
+         outstream += pstream2[ pindex ];
+      } else {
+         outstream += pstream1[ pindex ];
+      }
+      linind += partStride;
+   }
+}
+/* float4 version of kMergeFloat: sums the repfac replicated parts of the
+ * even-atom stream (pstream1) or odd-atom stream (pstream2) into outstream.
+ * The output position comes from indexof( outstream ); the sum starts from 0,
+ * so outstream is set, not added to.
+ *
+ *   repfac:           number of replicated parts to sum
+ *   atomStrWidth:     width of the output (atom) stream
+ *   pstreamStrWidth:  width of the partial streams
+ *   natoms:           number of atoms; natoms/iUnroll is the stride between parts
+ *   iUnroll:          i-unroll factor used to split atoms between the partial streams
+ *
+ * Integer quotients are computed as round( (x - fmod( x, w ))/w ) rather than
+ * floor( x/w ), matching the other merge kernels in this file.
+ * */
+kernel void kMergeFloat4( 
+      float repfac, 
+      float atomStrWidth, 
+      float pstreamStrWidth,
+      float natoms,
+      float iUnroll,
+      float4 pstream1[][], 
+      float4 pstream2[][], 
+      out float4 outstream<> )
+{
+   float linind;
+   float2 pindex;
+   float odd;
+   float i;
+   float quotient;
+   float partStride;
+   //convert to linear atom index
+   linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );
+   //remainder selects the stream: 0 -> pstream1, 1 -> pstream2
+   quotient = round( (linind - fmod( linind, iUnroll ))/iUnroll );
+   odd = linind - quotient * iUnroll;
+   //Now linear index is the index into partial_streams
+   linind = quotient;
+   //distance between successive replicated parts
+   partStride = natoms/iUnroll;
+   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+   //If we have predicated conditionals, we should 
+   //keep the conditional inside the loop
+   for ( i = 0.0f; i < repfac; i += 1.0f ) {
+      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth );
+      pindex.x = linind - pindex.y * pstreamStrWidth;
+      if ( odd > 0.5f ) { //is odd
+         outstream += pstream2[ pindex ];
+      } else {
+         outstream += pstream1[ pindex ];
+      }
+      linind += partStride;
+   }
+}
+/* Four-stream variant of the merge index walk.
+ *
+ * NOTE: as written this kernel does not read pstream1..pstream4. For each of
+ * the repfac parts it adds the tuple
+ *    ( partial-stream linear index, stream selector, gather x, gather y )
+ * to outstream, i.e. it reports the indices a merge would use. The gather
+ * itself is kept below, commented out.
+ *
+ *   repfac:           number of replicated parts to walk
+ *   atomStrWidth:     width of the output (atom) stream
+ *   pstreamStrWidth:  width of the partial streams
+ *   natoms:           number of atoms; natoms/iUnroll is the stride between parts
+ *   iUnroll:          i-unroll factor
+ * */
+kernel void kMergeFloat4_4X( 
+      float repfac, 
+      float atomStrWidth, 
+      float pstreamStrWidth,
+      float natoms,
+      float iUnroll,
+      float4 pstream1[][], 
+      float4 pstream2[][], 
+      float4 pstream3[][], 
+      float4 pstream4[][], 
+      out float4 outstream<> )
+{
+   float partIndex;
+   float2 gatherAt;
+   float selector;
+   float rep;
+   float quotient;
+   // linear atom index of this output element
+   partIndex = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );
+   // split into quotient (index into the partial streams) and remainder (which stream)
+   quotient  = floor( partIndex / iUnroll );
+   selector  = partIndex - quotient * iUnroll;
+   partIndex = quotient;
+   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+   rep = 0.0f;
+   while ( rep < repfac ) {
+      // row/column in the partial streams (fmod/round form instead of floor of a quotient)
+      gatherAt.y = round( (partIndex - fmod( partIndex, pstreamStrWidth ))/pstreamStrWidth );
+      gatherAt.x = partIndex - gatherAt.y * pstreamStrWidth;
+      // report the indices instead of gathering
+      outstream += float4( partIndex, selector, gatherAt.x, gatherAt.y );
+/*
+      disabled gather:
+      if ( selector < 0.5f ) {
+         outstream += pstream1[ gatherAt ];
+      } else if( selector < 1.5f ){
+         outstream += pstream2[ gatherAt ];
+      } else if( selector < 2.5f ){
+         outstream += pstream3[ gatherAt ];
+      } else {
+         outstream += pstream4[ gatherAt ];
+      }
+*/
+      partIndex += natoms/iUnroll;
+      rep       += 1.0f;
+   }
+}
+/* Merge four i-unrolled partial streams into one float4 per atom.
+ *
+ * For each of the repfac replicas the atom's force index is split into a
+ * quotient by iUnroll (position inside the partial streams) and a remainder
+ * (0..3, selecting pstream1..pstream4); the gathered values are summed.
+ * Successive replicas are roundNatoms apart in force-index space. The sum
+ * starts from zero, so outstream is set rather than added to.
+ *
+ *   repfac:           number of replicas to sum
+ *   atomStreamWidth:  width of the output (atom) stream
+ *   pStreamWidth:     width of the partial streams
+ *   natoms:           not used by this kernel
+ *   roundNatoms:      force-index stride between replicas
+ *   iUnroll:          i-unroll factor
+ * */
+kernel void kMergeFloat4_4( 
+      float repfac, 
+      float atomStreamWidth, 
+      float pStreamWidth,
+      float natoms,
+      float roundNatoms,
+      float iUnroll,
+      float4 pstream1[][], 
+      float4 pstream2[][], 
+      float4 pstream3[][], 
+      float4 pstream4[][], 
+      out float4 outstream<> )
+{
+   float2 outPos, gatherAt;
+   float flatIndex, quad, lane;
+   float rep;
+   // linear atom index of this output element
+   outPos      = indexof( outstream );
+   flatIndex   = outPos.x + outPos.y*atomStreamWidth;
+   outstream   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+   rep = 0.0f;
+   while( rep < repfac ){
+      // quotient and remainder of flatIndex by iUnroll (fmod/round instead of floor)
+      quad              = round( (flatIndex - fmod( flatIndex, iUnroll ))/iUnroll );
+      lane              = flatIndex - iUnroll*quad;
+      // 2D position of quad inside the partial streams
+      gatherAt.y        = round( (quad - fmod( quad, pStreamWidth ))/pStreamWidth );
+      gatherAt.x        = quad - gatherAt.y*pStreamWidth;
+      // the remainder picks the stream
+      if ( lane < 0.5f ){ 
+         outstream += pstream1[ gatherAt ];
+      } else if( lane < 1.5f ){
+         outstream += pstream2[ gatherAt ];
+      } else if( lane < 2.5f ){
+         outstream += pstream3[ gatherAt ];
+      } else {
+         outstream += pstream4[ gatherAt ];
+      }
+      flatIndex += roundNatoms;
+      rep       += 1.0f;
+   }
+}
+/* Merge the four i-unrolled partial streams (as kMergeFloat4_4 does) and derive
+ * bornRadii2Force from the merged .w component:
+ *    bornRadii2Force = obcChain * bornRadii^2 * outstream.w
+ *
+ *   repfac:           number of replicas to sum
+ *   atomStreamWidth:  width of the output (atom) stream
+ *   pStreamWidth:     width of the partial streams
+ *   natoms:           not used by this kernel
+ *   roundNatoms:      force-index stride between replicas
+ *   iUnroll:          i-unroll factor
+ *   pstream1..4:      partial streams (gathered)
+ *   obcChain:         per-atom chain term (input stream)
+ *   bornRadii:        per-atom Born radius (input stream)
+ *   outstream:        merged float4 sum
+ *   bornRadii2Force:  see formula above
+ * */
+kernel void kPostObcLoop1( 
+    float repfac, 
+    float atomStreamWidth, 
+    float pStreamWidth,
+    float natoms,
+    float roundNatoms,
+    float iUnroll,
+    float4 pstream1[][], 
+    float4 pstream2[][], 
+    float4 pstream3[][], 
+    float4 pstream4[][], 
+    float obcChain<>, 
+    float bornRadii<>, 
+    out float4 outstream<>,
+    out float bornRadii2Force<> )
+{
+    // ---------------------------------------------------------------------------------------
+    float2 outPos, gatherAt;
+    float flatIndex, quad, lane;
+    float rep;
+    // linear atom index of this output element
+    outPos      = indexof( outstream );
+    flatIndex   = outPos.x + outPos.y*atomStreamWidth;
+    outstream   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+    rep = 0.0f;
+    while( rep < repfac ){
+        // quotient and remainder of flatIndex by iUnroll (fmod/round instead of floor)
+        quad              = round( (flatIndex - fmod( flatIndex, iUnroll ))/iUnroll );
+        lane              = flatIndex - iUnroll*quad;
+        // 2D position of quad inside the partial streams
+        gatherAt.y        = round( (quad - fmod( quad, pStreamWidth ))/pStreamWidth );
+        gatherAt.x        = quad - gatherAt.y*pStreamWidth;
+        // This is going to cause really divergent code and we are
+        // going to end up doing all the fetches anyway; kPostObcLoop1_nobranch
+        // is the manually predicated alternative.
+        if ( lane < 0.5f ){ 
+            outstream += pstream1[ gatherAt ];
+        } else if( lane < 1.5f ){
+            outstream += pstream2[ gatherAt ];
+        } else if( lane < 2.5f ){
+            outstream += pstream3[ gatherAt ];
+        } else {
+            outstream += pstream4[ gatherAt ];
+        }
+        flatIndex += roundNatoms;
+        rep       += 1.0f;
+    }
+    bornRadii2Force  = obcChain*bornRadii*bornRadii*outstream.w;
+}
+// Branch-free form of kPostObcLoop1.
+// The stream selection inside the loop by definition creates divergent paths.
+// Chances are fair that all sides of the branch are taken anyway, so this
+// version gathers from all four partial streams on every iteration and picks
+// one value with conditional expressions (manual predication).
+// Outputs are the same as kPostObcLoop1: outstream is the merged sum and
+// bornRadii2Force = obcChain * bornRadii^2 * outstream.w.
+kernel void kPostObcLoop1_nobranch( 
+    float repfac, 
+    float atomStreamWidth, 
+    float pStreamWidth,
+    float natoms,
+    float roundNatoms,
+    float iUnroll,
+    float4 pstream1[][], 
+    float4 pstream2[][], 
+    float4 pstream3[][], 
+    float4 pstream4[][], 
+    float obcChain<>, 
+    float bornRadii<>, 
+    out float4 outstream<>,
+    out float bornRadii2Force<> )
+{
+    // ---------------------------------------------------------------------------------------
+    float2 outPos, gatherAt;
+    float flatIndex, quad, lane;
+    float rep;
+    float4 fromS1, fromS2, fromS3, fromS4;
+    float4 picked;
+    // linear atom index of this output element
+    outPos      = indexof( outstream );
+    flatIndex   = outPos.x + outPos.y*atomStreamWidth;
+    outstream   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+    rep = 0.0f;
+    while( rep < repfac ){
+        // quotient and remainder of flatIndex by iUnroll (fmod/round instead of floor)
+        quad              = round( (flatIndex - fmod( flatIndex, iUnroll ))/iUnroll );
+        lane              = flatIndex - iUnroll*quad;
+        // 2D position of quad inside the partial streams
+        gatherAt.y        = round( (quad - fmod( quad, pStreamWidth ))/pStreamWidth );
+        gatherAt.x        = quad - gatherAt.y*pStreamWidth;
+        // fetch from every stream, then keep the one the remainder selects
+        fromS1 = pstream1[ gatherAt ];
+        fromS2 = pstream2[ gatherAt ];
+        fromS3 = pstream3[ gatherAt ];
+        fromS4 = pstream4[ gatherAt ];
+        picked = lane < 0.5f ? fromS1 : fromS2;
+        picked = lane < 1.5f ? picked : fromS3;
+        picked = lane < 2.5f ? picked : fromS4;
+        outstream += picked;
+        flatIndex += roundNatoms;
+        rep       += 1.0f;
+    }
+    bornRadii2Force  = obcChain*bornRadii*bornRadii*outstream.w;
+}
+// Fill a float4 stream: every component of every element becomes `value`.
+kernel void kSetValue4( float value, out float4 outstream<> ){
+ float4 fill;
+ fill      = float4( value, value, value, value );
+ outstream = fill;
+}
+// Fill a float3 stream: every component of every element becomes `value`.
+kernel void kSetValue3( float value, out float3 outstream<> ){
+ float3 fill;
+ fill      = float3( value, value, value );
+ outstream = fill;
+}
+// Fill a float2 stream: both components of every element become `value`.
+kernel void kSetValue2( float value, out float2 outstream<> ){
+ float2 fill;
+ fill      = float2( value, value );
+ outstream = fill;
+}
+// Fill a float stream: every element becomes `value`.
+kernel void kSetValue1( float value,
+                        out float outstream<> ){
+ outstream = value;
+}
+// Index check: writes, for each output element, the tuple
+//    ( x position, y position, unroll*(x + y*pstreamStrWidth), that value mod natoms )
+// atomStrWidth is not used by this kernel.
+kernel void kCheck( float natoms, float atomStrWidth, float pstreamStrWidth, float unroll, out float4 outstream<> )
+{
+ float2 outPos;
+ float scaledIndex, wrappedIndex;
+ outPos       = indexof( outstream );
+ // linear position in the partial stream, scaled by the unroll factor
+ scaledIndex  = unroll*(outPos.x + outPos.y*pstreamStrWidth);
+ // wrapped into [0, natoms)
+ wrappedIndex = fmod( scaledIndex, natoms );
+ outstream    = float4( outPos.x, outPos.y, scaledIndex, wrappedIndex );
+}
+/* Add the merged two-stream partial results to an existing float4 stream.
+ *
+ * The partial values for even numbered particles are in pstream1 and for odd
+ * numbered particles in pstream2; each stream holds repfac replicated parts
+ * that are natoms/iUnroll elements apart.
+ *
+ * Unlike kMergeFloat4, the sum starts from inStream, with its .w component
+ * reset to 0, so .xyz is accumulated onto the incoming values while .w holds
+ * only the merged partial sum.
+ *
+ *   repfac:           number of replicated parts to sum
+ *   atomStrWidth:     width of the output (atom) stream
+ *   pstreamStrWidth:  width of the partial streams
+ *   natoms:           number of atoms; natoms/iUnroll is the stride between parts
+ *   iUnroll:          i-unroll factor
+ *   inStream:         values to add the merged result to
+ * */
+kernel void kAddAndMergeFloat4( 
+      float repfac, 
+      float atomStrWidth, 
+      float pstreamStrWidth,
+      float natoms,
+      float iUnroll,
+      float4 inStream<>, 
+      float4 pstream1[][], 
+      float4 pstream2[][], 
+      out float4 outstream<> )
+{
+   float partIndex;
+   float2 gatherAt;
+   float selector;
+   float rep;
+   float quotient;
+   // linear atom index of this output element
+   partIndex = (indexof outstream).x + (indexof outstream).y * atomStrWidth;
+   // quotient by iUnroll indexes the partial streams; the remainder picks the stream.
+   // fmod/round is used in place of floor( partIndex / iUnroll ).
+   quotient  = round( (partIndex - fmod(partIndex, iUnroll))/iUnroll );
+   selector  = partIndex - quotient * iUnroll;
+   partIndex = quotient;
+   // start from the incoming values, but with a cleared .w
+   outstream   = inStream;
+   outstream.w = 0.0f;
+   //If we have predicated conditionals, we should 
+   //keep the conditional inside the loop
+   rep = 0.0f;
+   while ( rep < repfac ) {
+      gatherAt.y = round( (partIndex - fmod( partIndex, pstreamStrWidth ))/pstreamStrWidth );
+      gatherAt.x = partIndex - gatherAt.y * pstreamStrWidth;
+      if ( selector > 0.5f ) { // odd atom
+         outstream += pstream2[ gatherAt ];
+      } else {
+         outstream += pstream1[ gatherAt ];
+      }
+      partIndex += natoms/iUnroll; 
+      rep       += 1.0f;
+   }
+}
+/* After forces above, we have the forces for even numbered particles
+ * in one stream, odd numbered particles in another.
+ * In each stream, the forces are in several parts depending on how
+ * many times we replicated the input stream.
+ *
+ * To avoid an extra kernel to zero forces, this sets the forces
+ * rather than adding to it.
+ * */
+/*
+kernel void kAddAndMergeFloat4_4( 
+      float repfac, 
+      float atomStrWidth, 
+      float pstreamStrWidth,
+      float natoms,
+      float iUnroll,
+      float4 inStream<>, 
+      float4 pstream1[][], 
+      float4 pstream2[][], 
+      float4 pstream3[][], 
+      float4 pstream4[][], 
+      out float4 outstream<> )
+{
+   float linind;
+   float2 pindex;
+   float odd;
+   float i;
+   float floor_linind_iUnroll;
+   linind = (indexof outstream).x + (indexof outstream).y * atomStrWidth;
+   //If odd or even, we pick from diferent streams.
+   //odd = linind - floor( linind / iUnroll ) * iUnroll;
+   //Now linear index is the index into partial_streams
+   //linind = floor( linind / iUnroll );
+   floor_linind_iUnroll = round( (linind - fmod(linind, iUnroll))/iUnroll );
+   odd                  = linind - floor_linind_iUnroll * iUnroll;//bixia modify
+   linind               = floor_linind_iUnroll; //bixia modify
+   outstream   = inStream;
+   outstream.w = 0.0f;
+//   outstream     = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+   //If we have predicated conditionals, we should 
+   //keep the conditional inside the loop
+   for ( i = 0.0f; i < repfac; i+= 1.0f ) {
+      //pindex.y = floor( linind / pstreamStrWidth );
+      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth ); //bixia modify
+      pindex.x = linind - pindex.y * pstreamStrWidth;
+      if ( odd < 0.5f ) { //is odd
+         outstream += pstream1[ pindex ];
+      } else if( odd < 1.5f ){
+         outstream += pstream2[ pindex ];
+      } else if( odd < 2.5f ){
+         outstream += pstream3[ pindex ];
+      } else {
+         outstream += pstream4[ pindex ];
+      }
+      linind += natoms/iUnroll; 
+   }
+} */
+/* Add the merged four-stream partial results to an existing float4 stream.
+ *
+ * Index handling matches kMergeFloat4_4: for each of the repfac replicas the
+ * force index is split by iUnroll into a position inside the partial streams
+ * and a remainder (0..3) selecting pstream1..pstream4; replicas are
+ * roundNatoms apart. The sum starts from inStream with .w reset to 0.
+ *
+ *   repfac:           number of replicas to sum
+ *   atomStreamWidth:  width of the output (atom) stream
+ *   pStreamWidth:     width of the partial streams
+ *   natoms:           not used by this kernel
+ *   roundNatoms:      force-index stride between replicas
+ *   iUnroll:          i-unroll factor
+ *   inStream:         values to add the merged result to
+ * */
+kernel void kAddAndMergeFloat4_4( 
+      float repfac, 
+      float atomStreamWidth, 
+      float pStreamWidth,
+      float natoms,
+      float roundNatoms,
+      float iUnroll,
+      float4 inStream<>, 
+      float4 pstream1[][], 
+      float4 pstream2[][], 
+      float4 pstream3[][], 
+      float4 pstream4[][], 
+      out float4 outstream<> ){
+   float2 outPos, gatherAt;
+   float flatIndex, quad, lane;
+   float rep;
+   // linear atom index of this output element
+   outPos      = indexof( outstream );
+   flatIndex   = outPos.x + outPos.y*atomStreamWidth;
+   // add current forces in inStream to forces stored in pstreams
+   // the .w entry is Born sum values; it will be used to calculate the
+   // Born radii and obcChain term, so it is restarted from 0
+   outstream   = inStream;
+   outstream.w = 0.0f;
+   // sum over j-loop 'duplications' by gathering from pstreams
+   rep = 0.0f;
+   while( rep < repfac ){
+      // quotient and remainder of flatIndex by iUnroll (fmod/round instead of floor)
+      quad              = round( (flatIndex - fmod( flatIndex, iUnroll))/iUnroll );
+      lane              = flatIndex - iUnroll*quad;
+      // 2D position of quad inside the partial streams
+      gatherAt.y        = round( (quad - fmod( quad, pStreamWidth ))/pStreamWidth );
+      gatherAt.x        = quad - gatherAt.y*pStreamWidth;
+      // the remainder picks the stream
+      if( lane < 0.5f ){ 
+         outstream += pstream1[ gatherAt ];
+      } else if( lane < 1.5f ){
+         outstream += pstream2[ gatherAt ];
+      } else if( lane < 2.5f ){
+         outstream += pstream3[ gatherAt ];
+      } else {
+         outstream += pstream4[ gatherAt ];
+      }
+      flatIndex += roundNatoms;
+      rep       += 1.0f;
+   }
+}
+kernel void kPostObcLoop2(          // merge partial results from pstream1-4, then derive bornRadii, obcChain and outForces
+      float repfac,                 // number of gather iterations (loop runs while i < repfac)
+      float atomStreamWidth,        // width used to linearize the 2D position of the output element
+      float pStreamWidth,           // width used to turn a linear pstream index back into a 2D gather index
+      float natoms,                 // elements with atomIndex >= natoms get bornRadii = obcChain = 0
+      float roundNatoms,            // stride added to the linear index after each gather iteration
+      float iUnroll,                // divisor splitting the linear index into quotient (qIndex) and remainder (qOff)
+      float conversion,             // scale factor applied to the merged forces.xyz
+      float mergeNonObcForces,      // nonObcForces is added to the output only when this is > 0.1
+      float4 inObcForces<>,         // .xyz seeds the force sum; .w is discarded
+      float3 nonObcForces<>,        // optionally added to the scaled result (see mergeNonObcForces)
+      float4 pstream1[][],          // read when qOff < 0.5
+      float4 pstream2[][],          // read when 0.5 <= qOff < 1.5
+      float4 pstream3[][],          // read when 1.5 <= qOff < 2.5
+      float4 pstream4[][],          // read when qOff >= 2.5
+      float  atomicRadii<>,         // per-element radius used in the Born radius and chain expressions
+      out float bornRadii<>,        // 1/( 1/(atomicRadii - dielectricOffset) - tanhSum/atomicRadii )
+      out float obcChain<>,         // (1 - tanhSum^2)*obcIntermediate/atomicRadii
+      out float3 outForces<> ){     // conversion*forces.xyz, plus nonObcForces when merging is enabled
+   // ---------------------------------------------------------------------------------------
+   float atomIndex, forceIndex, qIndex, qOff;
+   float2 pindex;
+   float i;
+   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
+   float2 iAtom;   // not referenced in this kernel
+   float4 forces;
+   float expPlus, expMinus;
+   // ---------------------------------------------------------------------------------------
+   // constants -- OBC Type II
+   const float alphaObc          = 1.0f;
+   const float betaObc           = 0.8f;
+   const float gammaObc          = 4.85f;
+   const float dielectricOffset  = 0.009f;
+   // ---------------------------------------------------------------------------------------
+   // linearize the 2D position of this output element into a single atom index
+   pindex      = indexof( outForces );
+   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
+   forceIndex  = atomIndex;
+   // add current forces in inObcForces to forces stored in pstreams
+   // the .w entry is Born sum values; it will be used to calculate the
+   // Born radii and obcChain term; .w is reset so only the pstream contributions are summed into it
+   forces   = inObcForces;
+   forces.w = 0.0f;
+//forces   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+   // sum over j-loop 'duplications' by gathering from pstreams
+   for( i = 0.0f; i < repfac; i += 1.0f ){
+      // qIndex            = floor( forceIndex/iUnroll );
+      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );   // quotient: remainder removed before dividing, then rounded
+      qOff              = forceIndex - iUnroll*qIndex;   // remainder of forceIndex/iUnroll; selects which pstream is read
+      // pindex.y          = floor( qIndex/ pStreamWidth );
+      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );   // row of qIndex in a pstream
+      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
+      pindex.x          = qIndex - pindex.y*pStreamWidth;   // column of qIndex in a pstream
+      if( qOff < 0.5f ){   // thresholds at x.5 pick the stream for qOff = 0, 1, 2, (anything larger)
+         forces += pstream1[ pindex ];
+      } else if( qOff < 1.5f ){
+         forces += pstream2[ pindex ];
+      } else if( qOff < 2.5f ){
+         forces += pstream3[ pindex ];
+      } else {
+         forces += pstream4[ pindex ];
+      }
+      forceIndex += roundNatoms;   // advance to the same atom's entry in the next duplication
+   }
+   // compute Born radii and ObcChain
+   atomicRadiiOffset            = atomicRadii - dielectricOffset;
+   bornSum                      = forces.w;   // only pstream contributions: .w was zeroed before the gather
+   bornSum                     *= 0.5f*atomicRadiiOffset;
+   sum2                         = bornSum*bornSum;
+   sum3                         = bornSum*sum2;
+   // Tanh does not exist? 
+   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
+   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
+   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;   // tanh argument x; tanhSum is overwritten with tanh(x) below
+   expPlus                      = exp( tanhSum );
+   expMinus                     = 1.0f/expPlus;   // exp(-x) taken as the reciprocal instead of a second exp call
+   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );   // NOTE(review): if exp() overflows this is inf/inf -- confirm the argument stays small
+   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
+   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );   // offset radius times d/d(bornSum) of the tanh argument (alphaObc is 1)
+   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
+   if( atomIndex >= natoms ){   // elements past the last atom: overwrite whatever was computed above with zeros
+      bornRadii = 0.0f;
+      obcChain  = 0.0f;
+   }
+   // add converted new forces to non-Obc forces
+   outForces = conversion*forces.xyz;
+   if( mergeNonObcForces > 0.1f ){   // float flag: values above 0.1 enable the merge
+      outForces += nonObcForces; 
+   }
+}
+kernel void kPostObcLoop2_nobranch( // same computation as kPostObcLoop2, but the pstream choice uses ?: selects instead of if/else
+      float repfac,                 // number of gather iterations (loop runs while i < repfac)
+      float atomStreamWidth,        // width used to linearize the 2D position of the output element
+      float pStreamWidth,           // width used to turn a linear pstream index back into a 2D gather index
+      float natoms,                 // elements with atomIndex >= natoms get bornRadii = obcChain = 0
+      float roundNatoms,            // stride added to the linear index after each gather iteration
+      float iUnroll,                // divisor splitting the linear index into quotient (qIndex) and remainder (qOff)
+      float conversion,             // scale factor applied to the merged forces.xyz
+      float mergeNonObcForces,      // nonObcForces is added to the output only when this is > 0.1
+      float4 inObcForces<>,         // .xyz seeds the force sum; .w is discarded
+      float3 nonObcForces<>,        // optionally added to the scaled result (see mergeNonObcForces)
+      float4 pstream1[][],          // selected when qOff < 0.5
+      float4 pstream2[][],          // selected when 0.5 <= qOff < 1.5
+      float4 pstream3[][],          // selected when 1.5 <= qOff < 2.5
+      float4 pstream4[][],          // selected when qOff >= 2.5
+      float  atomicRadii<>,         // per-element radius used in the Born radius and chain expressions
+      out float bornRadii<>,        // 1/( 1/(atomicRadii - dielectricOffset) - tanhSum/atomicRadii )
+      out float obcChain<>,         // (1 - tanhSum^2)*obcIntermediate/atomicRadii
+      out float3 outForces<> ){     // conversion*forces.xyz, plus nonObcForces when merging is enabled
+   // ---------------------------------------------------------------------------------------
+   float atomIndex, forceIndex, qIndex, qOff;
+   float2 pindex;
+   float i;
+   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
+   float4 o1,o2,o3,o4;   // values fetched from pstream1..pstream4 at the same index
+   float4 tmp;           // the one fetched value that is actually accumulated
+   float2 iAtom;   // not referenced in this kernel
+   float4 forces;
+   float expPlus, expMinus;
+   // ---------------------------------------------------------------------------------------
+   // constants -- OBC Type II
+   const float alphaObc          = 1.0f;
+   const float betaObc           = 0.8f;
+   const float gammaObc          = 4.85f;
+   const float dielectricOffset  = 0.009f;
+   // ---------------------------------------------------------------------------------------
+   // linearize the 2D position of this output element into a single atom index
+   pindex      = indexof( outForces );
+   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
+   forceIndex  = atomIndex;
+   // add current forces in inObcForces to forces stored in pstreams
+   // the .w entry is Born sum values; it will be used to calculate the
+   // Born radii and obcChain term; .w is reset so only the pstream contributions are summed into it
+   forces   = inObcForces;
+   forces.w = 0.0f;
+//forces   = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+   // sum over j-loop 'duplications' by gathering from pstreams
+   for( i = 0.0f; i < repfac; i += 1.0f ){
+      // qIndex            = floor( forceIndex/iUnroll );
+      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );   // quotient: remainder removed before dividing, then rounded
+      qOff              = forceIndex - iUnroll*qIndex;   // remainder of forceIndex/iUnroll; selects which fetched value is used
+      // pindex.y          = floor( qIndex/ pStreamWidth );
+      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );   // row of qIndex in a pstream
+      // pindex.x          = qIndex - pindex.y*pStreamWidth + qOff;
+      pindex.x          = qIndex - pindex.y*pStreamWidth;   // column of qIndex in a pstream
+      o1 = pstream1[ pindex ];   // all four streams are fetched unconditionally
+      o2 = pstream2[ pindex ];
+      o3 = pstream3[ pindex ];
+      o4 = pstream4[ pindex ];
+      tmp = qOff < 0.5f ? o1 : o2;    // qOff 0 -> o1, otherwise provisionally o2
+      tmp = qOff < 1.5f ? tmp : o3;   // qOff >= 1.5 replaces the choice with o3
+      tmp = qOff < 2.5f ? tmp : o4;   // qOff >= 2.5 replaces the choice with o4
+      forces += tmp;
+      forceIndex += roundNatoms;   // advance to the same atom's entry in the next duplication
+   }
+   // compute Born radii and ObcChain
+   atomicRadiiOffset            = atomicRadii - dielectricOffset;
+   bornSum                      = forces.w;   // only pstream contributions: .w was zeroed before the gather
+   bornSum                     *= 0.5f*atomicRadiiOffset;
+   sum2                         = bornSum*bornSum;
+   sum3                         = bornSum*sum2;
+   // Tanh does not exist? 
+   // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
+   // tanhSum                      = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
+   tanhSum                      = bornSum - betaObc*sum2 + gammaObc*sum3;   // tanh argument x; tanhSum is overwritten with tanh(x) below
+   expPlus                      = exp( tanhSum );
+   expMinus                     = 1.0f/expPlus;   // exp(-x) taken as the reciprocal instead of a second exp call
+   tanhSum                      = ( expPlus - expMinus )/( expPlus + expMinus );   // NOTE(review): if exp() overflows this is inf/inf -- confirm the argument stays small
+   bornRadii                    = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );  
+   obcIntermediate              = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );   // offset radius times d/d(bornSum) of the tanh argument (alphaObc is 1)
+   obcChain                     = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
+   if( atomIndex >= natoms ){   // elements past the last atom: overwrite whatever was computed above with zeros
+      bornRadii = 0.0f;
+      obcChain  = 0.0f;
+   }
+   // add converted new forces to non-Obc forces
+   outForces = conversion*forces.xyz;
+   if( mergeNonObcForces > 0.1f ){   // float flag: values above 0.1 enable the merge
+      outForces += nonObcForces; 
+   }
+}
+/* Scale the .w component of the intermediate force stream into bornRadii2Force.
+ *
+ * intermediateForceIn: float4 stream; only .w is read, .xyz is not used here
+ * bornRadii:           float stream; enters the product squared
+ * bornRadii2Force:     output stream
+ *
+ * bornRadii2Force = P4_ec*bornRadii*bornRadii*intermediateForceIn.w
+ * */
+kernel void kPreGbsaForce2( 
+      float4 intermediateForceIn<>, 
+      float bornRadii<>, 
+      out float bornRadii2Force<> ){
+   // ---------------------------------------------------------------------------------------
+   // float P4                   = 15.236f;
+   // P4_ec                      = P4/electricConstant
+   const float P4_ec             = -0.09176f;   // 15.236/(-166.02691), the electricConstant value used in kBornRadii
+   bornRadii2Force = P4_ec*bornRadii*bornRadii*intermediateForceIn.w;
+}
+/* Combine the OBC chain term with the .w component of the intermediate force stream.
+ *
+ * bornRadii2Force = obcChain*bornRadii*bornRadii*intermediateForceIn.w
+ *
+ * NOTE(review): obcChain is declared float4 here, yet the product is stored in a
+ * float output and kPostObcLoop2 writes obcChain as a float stream -- confirm the
+ * intended element type against the caller.
+ * */
+kernel void kPreObcForce2( 
+      float4 intermediateForceIn<>,   // only .w is read
+      float4 obcChain<>,              // see NOTE(review) above regarding float4 vs float
+      float bornRadii<>,              // enters the product squared
+      out float bornRadii2Force<> ){
+   // ---------------------------------------------------------------------------------------
+   bornRadii2Force = obcChain*bornRadii*bornRadii*intermediateForceIn.w;
+}
+/* Add forces from two streams: outForce = force1 + conversion*force2.xyz (force2.w is ignored) */ 
+kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
+   outForce.xyz  = force1 + conversion*force2.xyz;   // only force2 is scaled by conversion
+}
+/* Copy one float4 stream to another, element by element, unchanged */
+kernel void kCopyFloat4( float4 inForce<>, out float4 outForce<> ){
+   outForce = inForce;
+}
+/* Copy a float3 stream into a float4 stream
+ * (.xyz taken from the input, .w set to zero) */
+kernel void kCopyFloat3To4( 
+      float3 inForce<>,        // source values
+      out float4 outForce<> ){ // destination; .w is always written as 0
+   // ---------------------------------------------------------------------------------------
+   outForce.xyz = inForce;
+   outForce.w   = 0.0f;
+}
+/* ---------------------------------------------------------------------------------------
+   Calculate Born radius from bonded and nonbonded Gpol 
+   gpolNonBonded value is in gpolNonBonded.w 
+   bornRadius = electricConstant/( gpolFixed + 0.25*P4*gpolNonBonded.w )
+   --------------------------------------------------------------------------------------- */
+kernel void kBornRadii( float4 gpolNonBonded<>, float gpolFixed<>, out float bornRadius<> ){
+   // ---------------------------------------------------------------------------------------
+   // constants
+   const float electricConstant  = -166.02691f;
+   // 0.25*P4 with P4 = 15.236, the value quoted in kPreGbsaForce2
+   // (there P4_ec = P4/electricConstant = -0.09176).
+   // The previous literal 3.81575 equals 0.25*15.263, i.e. P4 with two digits transposed.
+   const float P4_25             = 3.809f;
+   // ---------------------------------------------------------------------------------------
+   // denominator first, then the reciprocal scaled by electricConstant
+   bornRadius = gpolFixed + P4_25*gpolNonBonded.w;
+   bornRadius = electricConstant/bornRadius;
+   // ---------------------------------------------------------------------------------------
+}
--- a/platforms/brook/src/gpu/kmerge_partial_forces.br
+++ b/platforms/brook/src/gpu/kmerge_partial_forces.br
@@ -99,3 +99,53 @@ kernel void kMergeFloat3_4(
   }
 }
+kernel void kMergeFloat3_4_nobranch( // outstream = sum of float3 values gathered from pstream1-4, chosen with ?: selects
+      float repfac,                 // number of gather iterations (loop runs while i < repfac)
+      float atomStreamWidth,        // width used to linearize the 2D position of the output element
+      float pStreamWidth,           // width used to turn a linear pstream index back into a 2D gather index
+      float natoms,                 // not referenced in this kernel
+      float roundNatoms,            // stride added to the linear index after each gather iteration
+      float iUnroll,                // divisor splitting the linear index into quotient (qIndex) and remainder (qOff)
+      float3 pstream1[][],          // selected when qOff < 0.5
+      float3 pstream2[][],          // selected when 0.5 <= qOff < 1.5
+      float3 pstream3[][],          // selected when 1.5 <= qOff < 2.5
+      float3 pstream4[][],          // selected when qOff >= 2.5
+      out float3 outstream<> )      // accumulated result for this element
+{
+   float atomIndex, forceIndex, qIndex, qOff;
+   float2 pindex;
+   float i;
+   float3 o1,o2,o3,o4;   // values fetched from pstream1..pstream4 at the same index
+   float3 tmp;           // the one fetched value that is actually accumulated
+   // linearize the 2D position of this output element into a single atom index
+   pindex      = indexof( outstream );
+   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
+   forceIndex  = atomIndex;
+   outstream   = float3( 0.0f, 0.0f, 0.0f );   // the sum starts from zero; there is no input stream to seed it
+   for( i = 0.0f; i < repfac; i += 1.0f ){
+      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );   // quotient: remainder removed before dividing, then rounded
+      qOff              = forceIndex - iUnroll*qIndex;   // remainder of forceIndex/iUnroll; selects which fetched value is used
+      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );   // row of qIndex in a pstream
+      pindex.x          = qIndex - pindex.y*pStreamWidth;   // column of qIndex in a pstream
+      o1 = pstream1[ pindex ];   // all four streams are fetched unconditionally
+      o2 = pstream2[ pindex ];
+      o3 = pstream3[ pindex ];
+      o4 = pstream4[ pindex ];
+      tmp = qOff < 0.5f ? o1 : o2;    // qOff 0 -> o1, otherwise provisionally o2
+      tmp = qOff < 1.5f ? tmp : o3;   // qOff >= 1.5 replaces the choice with o3
+      tmp = qOff < 2.5f ? tmp : o4;   // qOff >= 2.5 replaces the choice with o4
+      outstream += tmp;
+      forceIndex += roundNatoms;   // advance to the same atom's entry in the next duplication
+   }
+}
--- a/platforms/brook/src/gpu/knlist.br
+++ b/platforms/brook/src/gpu/knlist.br
-/****************************************************************
-* This file is part of the gpu acceleration library for gromacs.
-* Author: V. Vishal
-* Copyright (C) Pande Group, Stanford, 2006
-*****************************************************************/
-/* Order N^2 neighbor searching.
- *
- * This only works for force fields that don't have charge groups.
- * If you insist on charge groups, you'll have to pass in appropriate masks here.
- *
- * This is a simplified kernel, for testing the O(N) speeds.
- *
- * This does a complete N^2 search without considering groups of
- * atoms. Most likely this will prove to be inefficient for 
- * the O(N) kernel. Lets find out.
- * 
- *
- * Each component of the curpass textures is an atom index. The w component
- * of curpass3 is a count indicating how many j particles we have
- * scanned for this particular i atom.
- * 
- * */
-kernel void knborsearch(
-		float first,          //Positive means constructing the first 16.
-		iter float2 wpos<>,   //pixel position of output
-		float AtomStrHeight,
-		float AtomStrWidth,
-		float cutoff2,      //square of the cutoff
-		float natoms,       //number of atoms
-		float excl[][],    //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
-		float4 posq[][],    //atom positions/charges
-		float4 prevpass3<>, //Last output texture of previous pass
-		out float4 curpass0<>,  //First output of current pass
-		out float4 curpass1<>,  
-		out float4 curpass2<>,  
-		out float4 curpass3<>  //Last output of current pass, used in next pass
-		){
-	/*For this kernel, wpos == iatom*/
-	float2 iind;
-	float2 jind;
-	float3 ipos, jpos, dr;
-	float r2;
-	float listptr; //Where in the 16-chunk are we now.
-	float jlinind;
-	float breakflag; //positive means keep looping, negative means stop
-	float4 exclconst;
-	float2 exclind;
-	float exclusions;
-	exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
-	iind = wpos;
-	exclind.x = iind.x + iind.y * AtomStrWidth;
-	//fetch i atom
-	ipos = posq[ iind ].xyz;
-	//Loop over j depending on prevpass
-	jlinind = prevpass3.w + 1;
-	jind.y = floor( jlinind / AtomStrWidth );
-	jind.x = fmod( jlinind, AtomStrWidth );
-	exclind.y = jlinind;
-	//All outputs should be initialized to 
-	listptr = 0.0f;
-	breakflag = 1.0f;
-	//if we already finished, do nothing
-	if ( first < 0.0f && prevpass3.w < 0.0f )
-		breakflag = -1.0f;
-	//set to -1 to indicate no neighbor
-	//just to save a separate set of init calls
-	curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
-	curpass1 = curpass0;
-	curpass2 = curpass0;
-	curpass3 = curpass0;
-	while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
-		while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
-			//First see if this pair is excluded
-			exclusions = excl[ exclind ];
-			if ( exclusions < 0.5f ) {
-				jpos = posq[ jind ].xyz;
-				dr = jpos - ipos;
-				r2 = dot( dr, dr );
-				//If it is inside the cutoff
-				if ( r2 < cutoff2 ) {
-					//Figure out where to put it
-					//We are allowed 4 nested conditionals
-					//We can play with the structuring of these
-					if ( listptr < 0.5f )
-						curpass0.x = jlinind;
-					else if ( listptr < 1.5f )
-						curpass0.y = jlinind;
-					else if ( listptr < 2.5f )
-						curpass0.z = jlinind;
-					else if ( listptr < 3.5f )
-						curpass0.w = jlinind;
-					else if ( listptr < 4.5f )
-						curpass1.x = jlinind;
-					else if ( listptr < 5.5f )
-						curpass1.y = jlinind;
-					else if ( listptr < 6.5f )
-						curpass1.z = jlinind;
-					else if ( listptr < 7.5f )
-						curpass1.w = jlinind;
-					else if ( listptr < 8.5f )
-						curpass2.x = jlinind;
-					else if ( listptr < 9.5f )
-						curpass2.y = jlinind;
-					else if ( listptr < 10.5f )
-						curpass2.z = jlinind;
-					else if ( listptr < 11.5f )
-						curpass2.w = jlinind;
-					else if ( listptr < 12.5f )
-						curpass3.x = jlinind;
-					else if ( listptr < 13.5f )
-						curpass3.y = jlinind;
-					else if ( listptr < 14.5f ) {
-						curpass3.z = jlinind;
-					}
-					else if ( listptr < 15.5f ) {
-						//We're done for this pass
-						curpass3.w = jlinind;
-						breakflag = -1.0f;
-					}
-					listptr += 1.0f;
-				}
-			}
-			jlinind += 1.0f;
-			exclind.y += 1.0f;
-			jind.x  += 1.0f;
-		}
-		jind.x = 0.0f;
-		jind.y += 1.0f;
-	}
-}
-//Precomputes lennard jones sig and eps 
-//to save an indirect fetch (and a few flops) in the 
-//force kernel. The charge product is not done this way
-//because charges have to be fetched anyway with the 
-//positions
-kernel void knl_precompute_sigeps(
-		float AtomStrWidth,
-		iter float2 wpos<>,
-		float2 sigeps[][], //x=sigma, y=epsilon
-		float4 nlist0<>,
-		float4 nlist1<>,
-		out float4 sig0<>,
-		out float4 eps0<>,
-		out float4 sig1<>,
-		out float4 eps1<>
-	   	)
-{
-	float2 jind;
-	float4 ind_tmp1, ind_tmp2;
-	float2 isigeps, jsigeps1, jsigeps2, jsigeps3, jsigeps4;
-	isigeps = sigeps[ wpos ];
-	ind_tmp1 = floor( nlist0 / AtomStrWidth );
-	ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
-	jind.y = ind_tmp1.x;
-	jind.x = ind_tmp2.x;
-	jsigeps1 = sigeps[ jind ];
-	jind.y = ind_tmp1.y;
-	jind.x = ind_tmp2.y;
-	jsigeps2 = sigeps[ jind ];
-	jind.y = ind_tmp1.z;
-	jind.x = ind_tmp2.z;
-	jsigeps3 = sigeps[ jind ];
-	jind.y = ind_tmp1.w;
-	jind.x = ind_tmp2.w;
-	jsigeps4 = sigeps[ jind ];
-	sig0.x = isigeps.x + jsigeps1.x;
-	sig0.y = isigeps.x + jsigeps2.x;
-	sig0.z = isigeps.x + jsigeps3.x;
-	sig0.w = isigeps.x + jsigeps4.x;
-	eps0.x = isigeps.y * jsigeps1.y;
-	eps0.y = isigeps.y * jsigeps2.y;
-	eps0.z = isigeps.y * jsigeps3.y;
-	eps0.w = isigeps.y * jsigeps4.y;
-	//2nd nlist set
-	ind_tmp1 = floor( nlist1 / AtomStrWidth );
-	ind_tmp2 = nlist1 - ind_tmp1 * AtomStrWidth;
-	jind.y = ind_tmp1.x;
-	jind.x = ind_tmp2.x;
-	jsigeps1 = sigeps[ jind ];
-	jind.y = ind_tmp1.y;
-	jind.x = ind_tmp2.y;
-	jsigeps2 = sigeps[ jind ];
-	jind.y = ind_tmp1.z;
-	jind.x = ind_tmp2.z;
-	jsigeps3 = sigeps[ jind ];
-	jind.y = ind_tmp1.w;
-	jind.x = ind_tmp2.w;
-	jsigeps4 = sigeps[ jind ];
-	sig1.x = isigeps.x + jsigeps1.x;
-	sig1.y = isigeps.x + jsigeps2.x;
-	sig1.z = isigeps.x + jsigeps3.x;
-	sig1.w = isigeps.x + jsigeps4.x;
-	eps1.x = isigeps.y * jsigeps1.y;
-	eps1.y = isigeps.y * jsigeps2.y;
-	eps1.z = isigeps.y * jsigeps3.y;
-	eps1.w = isigeps.y * jsigeps4.y;
-}
--- a/platforms/brook/src/gpu/kpdihs.br
+++ b/platforms/brook/src/gpu/kpdihs.br
-/****************************************************************
-* This file is part of the gpu acceleration library for gromacs.
-* Author: V. Vishal
-* Copyright (C) Pande Group, Stanford, 2006
-*****************************************************************/
-//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
-//
-//Input is a stream of quartets i, j, k, l and the output is
-//four float3 streams fi, fj, fk, fl.
-//If by any chance this kernel becomes the bottleneck, we will
-//optimize, but for now, this is kept pretty simple.
-//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
-//for each dihedral.
-kernel void kpdih( 
-        float xstrwidth, //stream width for x
-        float4 xq[][], //particle coordinates and charges
-        float4 atoms<>, //ijkl quartets
-        float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
-        out float3 fi<>, //output forces for i, j, k, l
-        out float3 fj<>,
-        out float3 fk<>,
-        out float3 fl<>
-	  	) {
-	float3 r_ij, r_kj, r_kl;
-	float2 ai, aj, ak, al;
-	float3 m, n;
-	float sgnphi;
-	float cosfac;
-	float phi, ddphi, mdphi;
-	float3 u, v, s;
-	float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
-	//Convert from linear indices to 2D indices into x
-	//If this kernel is compute bound, we can do this 
-	//conversion before-hand and feed in the 2D coordinates
-	ai.y = floor( atoms.x / xstrwidth );
-	ai.x = atoms.x - ai.y * xstrwidth;
-	aj.y = floor( atoms.y / xstrwidth );
-	aj.x = atoms.y - aj.y * xstrwidth;
-	ak.y = floor( atoms.z / xstrwidth );
-	ak.x = atoms.z - ak.y * xstrwidth;
-	al.y = floor( atoms.w / xstrwidth );
-	al.x = atoms.w - al.y * xstrwidth;
-	r_ij = xq[ai].xyz - xq[aj].xyz; //3
-	r_kj = xq[ak].xyz - xq[aj].xyz; //3
-	r_kl = xq[ak].xyz - xq[al].xyz; //3
-	m = cross( r_ij, r_kj ); //9
-	n = cross( r_kj, r_kl ); //9
-	msq = dot(m, m); //5
-	nsq = dot(n, n); //5
-	cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0, 1.0 ); //8
-	sgnphi = sign( dot( r_ij, n ) ); //5
-	phi = sgnphi * acos( cos_phi ); //2
-	mdphi = parms.z * phi - parms.y; //2
-	ddphi = - parms.x * parms.z * sin( mdphi ); //3
-	nrkj2 = dot( r_kj, r_kj ); //5
-	nrkj  = sqrt( nrkj2 );     //1
-	fi = -ddphi * nrkj / msq * m; //5
-	fl =  ddphi * nrkj / nsq * n; //5
-	u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
-	v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
-	s = u - v; //3
-	fj = s - fi; //3
-	fk = -(s + fl); //3
-	//Total : 100 flops
-}
--- a/platforms/brook/src/gpu/krbdihs.br
+++ b/platforms/brook/src/gpu/krbdihs.br
-/****************************************************************
-* This file is part of the gpu acceleration library for gromacs.
-* Author: V. Vishal
-* Copyright (C) Pande Group, Stanford, 2006
-*****************************************************************/
-//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
-//
-//Input is a stream of quartets i, j, k, l and the output is
-//four float3 streams fi, fj, fk, fl.
-//If by any chance this kernel becomes the bottleneck, we will
-//optimize, but for now, this is kept pretty simple.
-//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
-//for each dihedral.
-kernel void krbdih( 
-        float xstrwidth, //stream width for x
-        float4 xq[][], //particle coordinates and charges
-        float4 atoms<>, //ijkl quartets
-        float4 parm03<>, //params 0-3
-        float2 parm45<>, //params 4 and 5
-        out float3 fi<>, //output forces for i, j, k, l
-        out float3 fj<>,
-        out float3 fk<>,
-        out float3 fl<>
-	  	) {
-	float3 r_ij, r_kj, r_kl;
-	float2 ai, aj, ak, al;
-	float3 m, n;
-	float sgnphi;
-	float cosfac;
-	float ddphi;
-	float3 u, v, s;
-	float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
-	//Convert from linear indices to 2D indices into x
-	//If this kernel is compute bound, we can do this 
-	//conversion before-hand and feed in the 2D coordinates
-	ai.y = floor( atoms.x / xstrwidth );
-	ai.x = atoms.x - ai.y * xstrwidth;
-	aj.y = floor( atoms.y / xstrwidth );
-	aj.x = atoms.y - aj.y * xstrwidth;
-	ak.y = floor( atoms.z / xstrwidth );
-	ak.x = atoms.z - ak.y * xstrwidth;
-	al.y = floor( atoms.w / xstrwidth );
-	al.x = atoms.w - al.y * xstrwidth;
-	r_ij = xq[ai].xyz - xq[aj].xyz; //3
-	r_kj = xq[ak].xyz - xq[aj].xyz; //3
-	r_kl = xq[ak].xyz - xq[al].xyz; //3
-	m = cross( r_ij, r_kj ); //9
-	n = cross( r_kj, r_kl ); //9
-	msq = dot(m, m); //5
-	nsq = dot(n, n); //5
-	cos_phi = dot(m, n)/sqrt(msq*nsq); //8 (sqrt=1)
-	//Switching to "polymer convention"
-	//See gromacs code
-	cos_phi = -cos_phi;
-	sgnphi  = sign( dot(r_ij, n) ); //5
-	sin_phi = -sgnphi*sqrt( clamp( 1.0 - cos_phi * cos_phi, 0.0, 1.0) ); //3
-	//ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1}
-	//This might not be the best way to use the 
-	//4-way mads, but for now we'll let fxc figure it
-	//out.
-	//If we precompute some ratios of the parameters
-	//we can use the 4-way mads better
-	ddphi = 5.0 * parm45.y;
-	ddphi = 4.0 * parm45.x + ddphi * cos_phi;
-	ddphi = 3.0 * parm03.w + ddphi * cos_phi;
-	ddphi = 2.0 * parm03.z + ddphi * cos_phi;
-	ddphi = parm03.y + ddphi * cos_phi;
-	ddphi = -ddphi * sin_phi; //13 flops total for ddphi
-	nrkj2 = dot( r_kj, r_kj ); //5
-	nrkj  = sqrt( nrkj2 );     //1
-	fi = -ddphi * nrkj / msq * m; //5
-	fl =  ddphi * nrkj / nsq * n; //5
-	u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
-	v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
-	s = u - v; //3
-	fj = s - fi; //3
-	fk = -(s + fl); //3
-	//Total flops: 109 per rb torsion.
-}
--- a/platforms/brook/src/gpu/kshakeh.h
+++ b/platforms/brook/src/gpu/kshakeh.h
+void  kshakeh_fix1 (const float  nit,
+		const float  strwidth,
+		const float  invmH,
+		const float  omega,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream params,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3);
+void  kshakeh_fix2 (const float  nit,
+		const float  strwidth,
+		const float  invmH,
+		const float  omega,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream params,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3);
+void  kshakeh_update (const float  strwidth,
+		::brook::stream invmap,
+		::brook::stream posq,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3,
+		::brook::stream oposq) ;
+void  kshakeh (const float  nit,
+		const float  strwidth,
+		const float  invmH,
+		const float  omega,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream params,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3); 
+void  kshakeh_update1_fix1 (
+      const float  strwidth,
+      const float  sdpc1,
+		::brook::stream invmap,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream vPrime,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3,
+		::brook::stream oposq); 
+void  kshakeh_update1_fix1Old (const float  strwidth,
+		::brook::stream invmap,
+		::brook::stream posq,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3,
+		::brook::stream oposq); 
+void  kshakeh_update2_fix1 (const float  strwidth,
+		::brook::stream invmap,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream cposq0,
+		::brook::stream cposq1,
+		::brook::stream cposq2,
+		::brook::stream cposq3,
+		::brook::stream oposq); 
--- a/platforms/brook/src/gpu/kupdatesd.h
+++ b/platforms/brook/src/gpu/kupdatesd.h
+void  kupdate_sd1 (
+		const float  xstrwidth,
+		const float  gstrwidth,
+		const float  goffset,
+		const float  cem,
+		const float  pc1,
+		const float  pc2,
+		const float  pc3,
+		::brook::stream sdpc,
+		::brook::stream fgauss,
+		::brook::stream sd2X,
+		::brook::stream posq,
+		::brook::stream f,
+		::brook::stream v,
+		::brook::stream invmass,
+		::brook::stream sd1V,
+		::brook::stream vnew,
+		::brook::stream posqp); 
+void  kupdate_sd2 (
+		const float  xstrwidth,
+		const float  gstrwidth,
+		const float  goffset,
+		const float  pc1,
+		const float  pc2,
+		::brook::stream sdpc,
+		::brook::stream fgauss,
+		::brook::stream sd1V,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream vnew,
+		::brook::stream sd2X,
+		::brook::stream v,
+		::brook::stream posqp2); 
+void  kpermute_vectors (const float  gstrwidth,
+		::brook::stream perm,
+		::brook::stream gvin,
+		::brook::stream gvout); 
+void  kupdate_sd2_fix1 (const float  xstrwidth,
+		const float  gstrwidth,
+		const float  goffset,
+		const float  pc1,
+		const float  pc2,
+		::brook::stream sdpc,
+		::brook::stream fgauss,
+		::brook::stream sd1V,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream vnew,
+		::brook::stream sd2X,
+		::brook::stream v,
+		::brook::stream posqp2); 
+void  kupdate_sd1_fix1 (const float  xstrwidth,
+		const float  gstrwidth,
+		const float  goffset,
+		const float  cem,
+		const float  pc1,
+		const float  pc2,
+		const float  pc3,
+		::brook::stream sdpc,
+		::brook::stream fgauss,
+		::brook::stream sd2X,
+		::brook::stream posq,
+		::brook::stream f,
+		::brook::stream v,
+		::brook::stream invmass,
+		::brook::stream sd1V,
+		::brook::stream vnew,
+		::brook::stream posqp);
+void  kupdate_sd2_fix1_FixedRV(const float  xstrwidth,
+		const float  gstrwidth,
+		const float  goffset,
+		const float  pc1,
+		const float  pc2,
+		::brook::stream sdpc,
+		::brook::stream fgauss,
+		::brook::stream sd1V,
+		::brook::stream posq,
+		::brook::stream posqp,
+		::brook::stream vnew,
+		::brook::stream sd2X,
+		::brook::stream v,
+		::brook::stream posqp2); 
+void  kupdate_sd1_fix1_FixedRV(const float  xstrwidth,
+		const float  gstrwidth,
+		const float  goffset,
+		const float  cem,
+		const float  pc1,
+		const float  pc2,
+		const float  pc3,
+		::brook::stream sdpc,
+		::brook::stream fgauss,
+		::brook::stream sd2X,
+		::brook::stream posq,
+		::brook::stream f,
+		::brook::stream v,
+		::brook::stream invmass,
+		::brook::stream sd1V,
+		::brook::stream vnew,
+		::brook::stream posqp);