Latest mods

b53cd947 · Mark Friedrichs · ff88ddad · b53cd947 · b53cd947 · b53cd947
Commit b53cd947 authored Aug 15, 2008 by Mark Friedrichs
14 changed files
--- a/platforms/brook/src/gpu/kcommonOrig.br
+++ b/platforms/brook/src/gpu/kcommonOrig.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+//Kernel to set the xyz components of a float4 stream
+//Used for changing the coordinates without changing
+//the charges in strPosQ
+kernel void ksetxyz( float3 instr<>, float4 before<>, out float4 after<> ) {
+	after.xyz = instr;
+	after.w   = before.w;
+}
+
+//Inverse of above
+kernel void kgetxyz( float4 instr<>, out float3 outstr<> ) {
+	outstr = instr.xyz;
+}
+
+
+//Zeroes out a stream
+kernel void kzerof3( out float3 outstr<> ) {
+	outstr = float3( 0.0, 0.0, 0.0 );
+}
+
+//Zeros out a stream
+kernel void kzerof4( out float4 outstr<> ) {
+	outstr = float4( 0.0, 0.0, 0.0, 0.0 );
+}
+
+kernel void ksetf4( float4 val, out float4 outstr<> ) {
+	outstr = val;
+}
--- a/platforms/brook/src/gpu/kforce.h
+++ b/platforms/brook/src/gpu/kforce.h
+#ifndef __KFORCE_H__
+#define __KFORCE_H__
+
+//Define a generic force kernel prototype
+//To make switching easier to look at
+//This should match the kernels from kforce.br
+typedef void (*gpuNBForceFunction)(
+		const float natoms,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2 
+		);
+
+void  knbforce_CDLJ (const float  natoms,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2); 
+
+void  knbforce_CDLJ2(const float  natoms,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2); 
+
+void  knbforce_CDLJ4(
+      const float  natoms,
+      const float  nAtomsCeiling,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2, 
+		::brook::stream outforce3, 
+		::brook::stream outforce4); 
+
+void  knbforce_CDLJ4Debug(
+      const float  natoms,
+      const float  nAtomsCeiling,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2, 
+		::brook::stream outforce3, 
+		::brook::stream outforce4); 
+
+void  knbforce_CDLJ_1(
+      const float  natoms,
+      const float  nAtomsCeiling,
+		const float  strwidth,
+		const float  epsfac,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream excl,
+		::brook::stream outforce ); 
+
+void  knbforce_CDLJ4NoEx(
+      const float  natoms,
+      const float  nAtomsCeiling,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2, 
+		::brook::stream outforce3, 
+		::brook::stream outforce4); 
+
+void  knbforce_LDLJ (const float  natoms,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		const __BRTIter& a_iatom2,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2); 
+
+void  knbforce_SFDLJ (const float  natoms,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		const __BRTIter& a_iatom2,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2); 
+
+void  knbforce_SHEFFIELD (const float  natoms,
+		const float  dupfac,
+		const float  strheight,
+		const float  strwidth,
+		const float  jstrwidth,
+		const float  fstrwidth,
+		const float  epsfac,
+		const float4 params,
+		const __BRTIter& a_iatom2,
+		::brook::stream posq,
+		::brook::stream isigeps,
+		::brook::stream sigma,
+		::brook::stream epsilon,
+		::brook::stream excl,
+		::brook::stream outforce1,
+		::brook::stream outforce2); 
+
+void  kmerge_partial_forces (const float  repfac,
+		const float  atomStrWidth,
+		const float  pforceStrWidth,
+		const float  natoms,
+		::brook::stream pforce1,
+		::brook::stream pforce2,
+		::brook::stream force); 
+
+void kMergeFloat3_4( 
+      float repfac, 
+      float atomStreamWidth, 
+      float pStreamWidth,
+      float natoms,
+      float roundNatoms,
+      float iUnroll,
+		::brook::stream pforce1,
+		::brook::stream pforce2,
+		::brook::stream pforce3,
+		::brook::stream pforce4,
+		::brook::stream force );
+
+
+void kMergeFloat3_4_nobranch( 
+      float repfac, 
+      float atomStreamWidth, 
+      float pStreamWidth,
+      float natoms,
+      float roundNatoms,
+      float iUnroll,
+		::brook::stream pforce1,
+		::brook::stream pforce2,
+		::brook::stream pforce3,
+		::brook::stream pforce4,
+		::brook::stream force );
+
+typedef void ( *gpuNBForceFunction14 )
+	(const float  natoms,
+		const float  strwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream sigeps,
+		::brook::stream fi,
+		::brook::stream fj); 
+
+
+void  kforce14_CDLJ
+	(const float  natoms,
+		const float  strwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream sigeps,
+		::brook::stream fi,
+		::brook::stream fj); 
+
+void  kforce14_LDLJ 
+	(const float  natoms,
+		const float  strwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream sigeps,
+		::brook::stream fi,
+		::brook::stream fj); 
+
+void  kforce14_SFDLJ 
+	(const float  natoms,
+		const float  strwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream sigeps,
+		::brook::stream fi,
+		::brook::stream fj); 
+
+void  kforce14_SHEFFIELD
+	(const float  natoms,
+		const float  strwidth,
+		const float  epsfac,
+		const float4 params,
+		::brook::stream atoms,
+		::brook::stream posq,
+		::brook::stream sigeps,
+		::brook::stream fi,
+		::brook::stream fj); 
+
+typedef void (*gpuBondedFunction)(
+		const float  epsfac,
+		const float  xstrwidth,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream atoms,
+		::brook::stream parm0,
+		::brook::stream parm1,
+		::brook::stream parm2,
+		::brook::stream parm3,
+		::brook::stream parm4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl);
+
+void  kbonded_CDLJ (const float  epsfac,
+		const float  xstrwidth,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream atoms,
+		::brook::stream parm0,
+		::brook::stream parm1,
+		::brook::stream parm2,
+		::brook::stream parm3,
+		::brook::stream parm4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl); 
+
+void  kbonded_CDLJDebug (const float  epsfac,
+		const float  xstrwidth,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream atoms,
+		::brook::stream parm0,
+		::brook::stream parm1,
+		::brook::stream parm2,
+		::brook::stream parm3,
+		::brook::stream parm4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl); 
+
+void  kbonded_LDLJ (const float  epsfac,
+		const float  xstrwidth,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream atoms,
+		::brook::stream parm0,
+		::brook::stream parm1,
+		::brook::stream parm2,
+		::brook::stream parm3,
+		::brook::stream parm4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl); 
+
+void  kbonded_SFDLJ (const float  epsfac,
+		const float  xstrwidth,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream atoms,
+		::brook::stream parm0,
+		::brook::stream parm1,
+		::brook::stream parm2,
+		::brook::stream parm3,
+		::brook::stream parm4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl); 
+
+void  kbonded_SHEFFIELD (const float  epsfac,
+		const float  xstrwidth,
+		const float4 params,
+		::brook::stream posq,
+		::brook::stream atoms,
+		::brook::stream parm0,
+		::brook::stream parm1,
+		::brook::stream parm2,
+		::brook::stream parm3,
+		::brook::stream parm4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl); 
+
+void kAddForces3_4( 
+		const float  conversion,
+      ::brook::stream force1, 
+      ::brook::stream force2, 
+      ::brook::stream outForce );
+ 
+#endif //__KFORCE_H__
--- a/platforms/brook/src/gpu/kforce_CDLJ.br
+++ b/platforms/brook/src/gpu/kforce_CDLJ.br
+kernel float4 scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2, float4 params ) {
+ float4 invr, invrsig2, invrsig6;
+ float4 f;
+
+ invr = rsqrt( r2 );
+ invrsig2 = invr * sig;
+ invrsig2 = invrsig2 * invrsig2;
+ invrsig6 = invrsig2 * invrsig2 * invrsig2;
+ f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
+ f += epsfac*qq*invr;
+ f *= invr*invr;
+ return f;
+}
+
+kernel float scalar_force_single_CDLJ( float qq, float epsfac, float sig, float eps, float r2 ){
+ float invr, invrsig2, invrsig6;
+ float f;
+
+ invr = rsqrt( r2 );
+ invrsig2 = invr * sig;
+ invrsig2 = invrsig2 * invrsig2;
+ invrsig6 = invrsig2 * invrsig2 * invrsig2;
+ f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
+ f += epsfac*qq*invr;
+ f *= invr*invr;
+ return f;
+}
+kernel float4 get_r2_CDLJ( float3 d1, float3 d2, float3 d3, float3 d4 ) {
+ float4 r2;
+ r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
+ return r2;
+}
+kernel void knbforce_CDLJ (
+  float natoms,
+  float dupfac,
+  float strheight,
+  float strwidth,
+  float jstrwidth,
+  float fstrwidth,
+  float epsfac,
+  float4 params,
+  float4 posq[][],
+  float4 isigeps<>,
+  float4 sigma[][],
+  float4 epsilon[][],
+  float2 excl[][],
+
+
+  out float3 outforce1<>,
+  out float3 outforce2<> ) {
+
+ float4 jsig, jeps;
+ float3 ipos1, ipos2;
+ float3 jpos1, jpos2, jpos3, jpos4;
+ float qi1, qi2;
+ float4 qj, qq;
+ float3 d1, d2, d3, d4;
+ float2 exclind;
+ float4 sig, eps;
+ float2 exclusions;
+ float4 r2;
+ float3 pad;
+ float4 exclmask;
+ float4 exclconst;
+ float2 iatom;
+ float2 jstrindex;
+ float a_iatom;
+ float linind;
+
+ float breakflag;
+ float2 jatom;
+ float4 fs;
+ float jend, jstart;
+ float which_rep;
+
+ exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
+
+
+ outforce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+ outforce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+
+
+   a_iatom = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
+
+
+ linind = fmod( a_iatom, natoms );
+ iatom.x = fmod( linind, strwidth );
+ iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
+
+
+
+ ipos1 = posq[ iatom ].xyz;
+ qi1 = posq[ iatom ].w;
+
+ iatom.x += 1;
+ ipos2 = posq[ iatom ].xyz;
+ qi2 = posq[ iatom ].w;
+
+
+ breakflag = 1.0f;
+ jatom.y = 0.0f;
+
+
+
+
+
+
+
+ which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
+ jend = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
+ jstart = which_rep * jend;
+ if ( which_rep > dupfac - 1 - 0.5f ) {
+
+  jend = 1e6f;
+ }
+ jend += jstart;
+
+ exclind.x = linind/2.0f;
+ exclind.y = jstart * strwidth/4.0f;
+
+ jatom.y = jstart;
+ while ( jatom.y < jend && breakflag > 0.0f ) {
+  jatom.x = 0;
+  while ( jatom.x < strwidth && breakflag > 0.0f ) {
+
+   exclusions = excl[ exclind ];
+
+   exclmask = fmod( exclusions.x, exclconst );
+
+
+   linind = (jatom.x + jatom.y*strwidth)/4.0f;
+   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
+   jstrindex.x = linind - jstrindex.y * jstrwidth;
+
+
+   jsig = sigma[ jstrindex ];
+   jeps = epsilon[ jstrindex ];
+
+   jpos1 = posq[ jatom ].xyz;
+   qj.x = posq[ jatom ].w;
+   jatom.x += 1.0f;
+
+   jpos2 = posq[ jatom ].xyz;
+   qj.y = posq[ jatom ].w;
+   jatom.x += 1.0f;
+
+   jpos3 = posq[ jatom ].xyz;
+   qj.z = posq[ jatom ].w;
+   jatom.x += 1.0f;
+
+   jpos4 = posq[ jatom ].xyz;
+   qj.w = posq[ jatom ].w;
+   jatom.x += 1.0f;
+
+
+   sig = isigeps.x + jsig;
+   eps = isigeps.y * jeps;
+   qq = qi1 * qj;
+
+   d1 = ipos1 - jpos1;
+   d2 = ipos1 - jpos2;
+   d3 = ipos1 - jpos3;
+   d4 = ipos1 - jpos4;
+
+
+   r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce1 += fs.x * d1;
+   outforce1 += fs.y * d2;
+   outforce1 += fs.z * d3;
+   outforce1 += fs.w * d4;
+/*
+outforce1.x += exclmask.x;
+outforce1.x += exclmask.y;
+outforce1.x += exclmask.z;
+outforce1.x += exclmask.w;
+if( exclmask.x > 0.5f ){
+   outforce1.x += 1.0f;
+}
+if( exclmask.y > 0.5f ){
+   outforce1.x += 1.0f;
+}
+if( exclmask.z > 0.5f ){
+   outforce1.x += 1.0f;
+}
+if( exclmask.w > 0.5f ){
+   outforce1.x += 1.0f;
+}
+*/
+
+   exclmask = fmod( exclusions.y, exclconst );
+
+   sig = isigeps.z + jsig;
+   eps = isigeps.w * jeps;
+   qq = qi2 * qj;
+
+   d1 = ipos2 - jpos1;
+   d2 = ipos2 - jpos2;
+   d3 = ipos2 - jpos3;
+   d4 = ipos2 - jpos4;
+
+   r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce2 += fs.x * d1;
+   outforce2 += fs.y * d2;
+   outforce2 += fs.z * d3;
+   outforce2 += fs.w * d4;
+
+/*
+outforce2.x += exclmask.x;
+outforce2.x += exclmask.y;
+outforce2.x += exclmask.z;
+outforce2.x += exclmask.w;
+if( exclmask.x > 0.5f ){
+   outforce2.x += 1.0f;
+}
+if( exclmask.y > 0.5f ){
+   outforce2.x += 1.0f;
+}
+if( exclmask.z > 0.5f ){
+   outforce2.x += 1.0f;
+}
+if( exclmask.w > 0.5f ){
+   outforce2.x += 1.0f;
+}
+*/
+
+   exclind.y += 1.0f;
+
+
+   if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
+    breakflag = -1.0f;
+  }
+
+  jatom.y += 1.0f;
+ }
+
+}
+
+kernel void knbforce_CDLJ2(
+  float natoms,
+  float dupfac,
+  float strheight,
+  float strwidth,
+  float jstrwidth,
+  float fstrwidth,
+  float epsfac,
+  float4 params,
+  float4 posq[][],
+  float4 isigeps[][],
+  float4 sigma[][],
+  float4 epsilon[][],
+  float excl[][],
+
+
+  out float3 outforce1<>,
+  out float3 outforce2<> ) {
+
+ float4 jsig, jeps;
+ float3 ipos1, ipos2;
+ float4 jpos;
+ float3 jpos1, jpos2, jpos3, jpos4;
+ float qi1, qi2;
+ float isig1, isig2;
+ float ieps1, ieps2;
+ float4 qj, qq;
+ float3 d1, d2, d3, d4;
+ float2 exclind;
+ float4 sig, eps;
+ float  exclusions;
+ float4 r2;
+ float3 pad;
+ float4 exclmask;
+ float4 exclconst;
+ float2 iatom;
+ float2 jstrindex;
+ float a_iatom;
+ float linind;
+
+ float breakflag;
+ float2 jatom;
+ float4 fs;
+ float jend, jstart;
+ float which_rep;
+
+ exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
+
+ outforce1  = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+ outforce2  = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+
+
+ a_iatom    = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
+ linind     = fmod( a_iatom, natoms );
+ iatom.x    = fmod( linind, strwidth );
+ iatom.y    = round( (linind - fmod(linind, strwidth))/strwidth );
+
+ jpos       = posq[ iatom ];
+ ipos1      = jpos.xyz;
+ qi1        = jpos.w;
+ jstrindex  = isigeps[ iatom ];
+ isig1      = jstrindex.x;
+ ieps1      = jstrindex.y;
+
+ iatom.x   += 1;
+ jpos       = posq[ iatom ];
+ ipos2      = jpos.xyz;
+ qi2        = jpos.w;
+ jstrindex  = isigeps[ iatom ];
+ isig2      = jstrindex.x;
+ ieps2      = jstrindex.y;
+
+ breakflag  = 1.0f;
+ jatom.y    = 0.0f;
+
+ which_rep  = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
+ jend       = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
+ jstart     = which_rep * jend;
+ if ( which_rep > dupfac - 1 - 0.5f ) {
+  jend = 1e6f;
+ }
+ jend      += jstart;
+
+ exclind.x  = linind;
+ exclind.y  = jstart * strwidth/4.0f;
+
+ jatom.y    = jstart;
+ while ( jatom.y < jend && breakflag > 0.0f ) {
+  jatom.x = 0;
+  while ( jatom.x < strwidth && breakflag > 0.0f ) {
+
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+   linind      = (jatom.x + jatom.y*strwidth)/4.0f;
+   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
+   jstrindex.x = linind - jstrindex.y * jstrwidth;
+
+   jsig        = sigma[ jstrindex ];
+   jeps        = epsilon[ jstrindex ];
+
+   jpos        = posq[ jatom ];
+   jpos1       = jpos.xyz;
+   qj.x        = jpos.w;
+   jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos2       = jpos.xyz;
+   qj.y        = jpos.w;
+   jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos3       = jpos.xyz;
+   qj.z        = jpos.w;
+   jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos4       = jpos.xyz;
+   qj.w        = jpos.w;
+   jatom.x    += 1.0f;
+
+   sig         = isig1 + jsig;
+   eps         = ieps1 * jeps;
+   qq          = qi1*qj;
+
+   d1          = ipos1 - jpos1;
+   d2          = ipos1 - jpos2;
+   d3          = ipos1 - jpos3;
+   d4          = ipos1 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce1  += fs.x * d1;
+   outforce1  += fs.y * d2;
+   outforce1  += fs.z * d3;
+   outforce1  += fs.w * d4;
+
+   exclind.x  += 1.0f;
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig2 + jsig;
+   eps         = ieps2 * jeps;
+   qq          = qi2 * qj;
+
+   d1          = ipos2 - jpos1;
+   d2          = ipos2 - jpos2;
+   d3          = ipos2 - jpos3;
+   d4          = ipos2 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce2  += fs.x * d1;
+   outforce2  += fs.y * d2;
+   outforce2  += fs.z * d3;
+   outforce2  += fs.w * d4;
+
+   exclind.x  -= 1.0f;
+   exclind.y  += 1.0f;
+
+   if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
+    breakflag = -1.0f;
+  }
+
+  jatom.y += 1.0f;
+ }
+
+}
+
+
+kernel void knbforce_CDLJ4(
+  float natoms,
+  float nAtomsCeiling,
+  float dupfac,
+  float strheight,
+  float strwidth,
+  float jstrwidth,
+  float fstrwidth,
+  float epsfac,
+  float4 params,
+  float4 posq[][],
+  float4 isigeps[][],
+  float4 sigma[][],
+  float4 epsilon[][],
+  float excl[][],
+  out float3 outforce1<>,
+  out float3 outforce2<>,
+  out float3 outforce3<>,
+  out float3 outforce4<> ) {
+
+ float4 jsig, jeps;
+ float3 ipos1, ipos2, ipos3, ipos4;
+ float4 jpos;
+ float3 jpos1, jpos2, jpos3, jpos4;
+ float qi1, qi2, qi3, qi4;
+
+ float isig1, isig2, isig3, isig4;
+ float ieps1, ieps2, ieps3, ieps4;
+
+ float4 qj, qq;
+ float3 d1, d2, d3, d4;
+ float2 exclind;
+ float4 sig, eps;
+ float  exclusions;
+ float4 r2;
+ float3 pad;
+ float4 exclmask;
+ float4 exclconst;
+ float2 iatom;
+ float2 jstrindex;
+ float a_iatom;
+ float linind;
+
+ float breakflag;
+ float2 jatom;
+ float4 fs;
+ float jend, jstart;
+ float which_rep;
+
+ exclconst  = float4( 2.0f, 3.0f, 5.0f, 7.0f );
+
+ outforce1  = float3( 0.0f, 0.0f, 0.0f );
+ outforce2  = float3( 0.0f, 0.0f, 0.0f );
+ outforce3  = float3( 0.0f, 0.0f, 0.0f );
+ outforce4  = float3( 0.0f, 0.0f, 0.0f );
+
+ a_iatom    = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
+
+ // linind     = fmod( a_iatom, natoms );
+ linind     = fmod( a_iatom, nAtomsCeiling );
+
+ iatom.x    = fmod( linind, strwidth );
+ iatom.y    = round( (linind - fmod(linind, strwidth))/strwidth );
+
+/* --------------------------------------------------------------------- */
+
+ jpos       = posq[ iatom ];
+ ipos1      = jpos.xyz;
+ qi1        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig1      = jstrindex.x;
+ ieps1      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1.0f;
+ jpos       = posq[ iatom ];
+ ipos2      = jpos.xyz;
+ qi2        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig2      = jstrindex.x;
+ ieps2      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1.0f;
+ jpos       = posq[ iatom ];
+ ipos3      = jpos.xyz;
+ qi3        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig3      = jstrindex.x;
+ ieps3      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1.0f;
+ jpos       = posq[ iatom ];
+ ipos4      = jpos.xyz;
+ qi4        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig4      = jstrindex.x;
+ ieps4      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ breakflag  = 1.0f;
+ jatom.y    = 0.0f;
+
+ // which_rep  = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
+ which_rep  = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
+
+ jend       = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
+ jstart     = which_rep * jend;
+ if ( which_rep > dupfac - 1.5f ) {
+  jend = 1e6f;
+ }
+ jend      += jstart;
+
+ exclind.x  = linind;
+ exclind.y  = jstart * strwidth/4.0f;
+ exclind.y  = floor(  exclind.y  + 0.25f );
+
+ jatom.y    = jstart;
+ while ( jatom.y < jend && breakflag > 0.0f ) {
+  jatom.x = 0;
+  while ( jatom.x < strwidth && breakflag > 0.0f ) {
+
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   linind      = (jatom.x + jatom.y*strwidth)/4.0f;
+   linind      = floor( linind + 0.25f );
+
+   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
+   jstrindex.x = linind - jstrindex.y * jstrwidth;
+   jstrindex.x = floor( jstrindex.x + 0.25f );
+
+   /* --------------------------------------------------------------------- */
+
+   jsig        = sigma[ jstrindex ];
+   jeps        = epsilon[ jstrindex ];
+
+   jpos        = posq[ jatom ];
+   jpos1       = jpos.xyz;
+   qj.x        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos2       = jpos.xyz;
+   qj.y        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos3       = jpos.xyz;
+   qj.z        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos4       = jpos.xyz;
+   qj.w        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   /* --------------------------------------------------------------------- */
+
+   sig         = isig1 + jsig;
+   eps         = ieps1 * jeps;
+   qq          = qi1*qj;
+
+   d1          = ipos1 - jpos1;
+   d2          = ipos1 - jpos2;
+   d3          = ipos1 - jpos3;
+   d4          = ipos1 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce1  += fs.x * d1;
+   outforce1  += fs.y * d2;
+   outforce1  += fs.z * d3;
+   outforce1  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  += 1.0f;
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig2 + jsig;
+   eps         = ieps2 * jeps;
+   qq          = qi2 * qj;
+
+   d1          = ipos2 - jpos1;
+   d2          = ipos2 - jpos2;
+   d3          = ipos2 - jpos3;
+   d4          = ipos2 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce2  += fs.x * d1;
+   outforce2  += fs.y * d2;
+   outforce2  += fs.z * d3;
+   outforce2  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  += 1.0f;
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig3 + jsig;
+   eps         = ieps3 * jeps;
+   qq          = qi3   * qj;
+
+   d1          = ipos3 - jpos1;
+   d2          = ipos3 - jpos2;
+   d3          = ipos3 - jpos3;
+   d4          = ipos3 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce3  += fs.x * d1;
+   outforce3  += fs.y * d2;
+   outforce3  += fs.z * d3;
+   outforce3  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  += 1.0f;
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig4 + jsig;
+   eps         = ieps4 * jeps;
+   qq          = qi4   * qj;
+
+   d1          = ipos4 - jpos1;
+   d2          = ipos4 - jpos2;
+   d3          = ipos4 - jpos3;
+   d4          = ipos4 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce4  += fs.x * d1;
+   outforce4  += fs.y * d2;
+   outforce4  += fs.z * d3;
+   outforce4  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  -= 3.0f;
+   exclind.x   = floor( exclind.x - 2.9f );
+
+   //exclind.y  += 1.0f;
+   exclind.y  = floor(  exclind.y  + 1.25f );
+
+   if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
+  }
+
+  //jatom.y += 1.0f;
+  jatom.y  = floor( jatom.y + 1.2f );
+ }
+
+}
+
+kernel void knbforce_CDLJ4Debug(
+  float natoms,
+  float nAtomsCeiling,
+  float dupfac,
+  float strheight,
+  float strwidth,
+  float jstrwidth,
+  float fstrwidth,
+  float epsfac,
+  float4 params,
+  float4 posq[][],
+  float4 isigeps[][],
+  float4 sigma[][],
+  float4 epsilon[][],
+  float excl[][],
+  out float3 outforce1<>,
+  out float3 outforce2<>,
+  out float3 outforce3<>,
+  out float3 outforce4<> ) {
+
+ float4 jsig, jeps;
+ float3 ipos1, ipos2, ipos3, ipos4;
+ float4 jpos;
+ float3 jpos1, jpos2, jpos3, jpos4;
+ float qi1, qi2, qi3, qi4;
+
+ float isig1, isig2, isig3, isig4;
+ float ieps1, ieps2, ieps3, ieps4;
+
+ float4 qj, qq;
+ float3 d1, d2, d3, d4;
+ float2 exclind;
+ float4 sig, eps;
+ float  exclusions;
+ float4 r2;
+ float3 pad;
+ float4 exclmask;
+ float4 exclconst;
+ float2 iatom;
+ float2 jstrindex;
+ float a_iatom;
+ float linind;
+ float linind1;
+
+ float breakflag;
+ float2 jatom;
+ float4 fs;
+ float jend, jstart;
+ float which_rep;
+ float4 one4;
+ float4 zero4;
+ float4 frac4;
+float sum;
+
+ exclconst  = float4( 2.0f, 3.0f, 5.0f, 7.0f );
+
+ outforce1  = float3( 0.0f, 0.0f, 0.0f );
+ outforce2  = float3( 0.0f, 0.0f, 0.0f );
+ outforce3  = float3( 0.0f, 0.0f, 0.0f );
+ outforce4  = float3( 0.0f, 0.0f, 0.0f );
+
+ one4       = float4( 1.0f, 1.0f, 1.0f, 1.0f );
+ zero4      = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+ frac4      = float4( 0.2f, 0.2f, 0.2f, 0.2f );
+
+ a_iatom    = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
+
+ // linind     = fmod( a_iatom, natoms );
+ linind     = fmod( a_iatom, nAtomsCeiling );
+
+ iatom.x    = fmod( linind, strwidth );
+ iatom.y    = round( (linind - fmod(linind, strwidth))/strwidth );
+
+/* --------------------------------------------------------------------- */
+
+ jpos       = posq[ iatom ];
+ ipos1      = jpos.xyz;
+ qi1        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig1      = jstrindex.x;
+ ieps1      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1;
+ jpos       = posq[ iatom ];
+ ipos2      = jpos.xyz;
+ qi2        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig2      = jstrindex.x;
+ ieps2      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1;
+ jpos       = posq[ iatom ];
+ ipos3      = jpos.xyz;
+ qi3        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig3      = jstrindex.x;
+ ieps3      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1;
+ jpos       = posq[ iatom ];
+ ipos4      = jpos.xyz;
+ qi4        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig4      = jstrindex.x;
+ ieps4      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ breakflag  = 1.0f;
+ jatom.y    = 0.0f;
+
+ // which_rep  = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
+ which_rep  = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
+
+ jend       = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
+ jstart     = which_rep*jend;
+ if ( which_rep > dupfac - 1.5f ) {
+  jend = 1e6f;
+ }
+ jend      += jstart;
+
+ exclind.x  = linind;
+ linind1    = linind;
+
+ exclind.y  = jstart*strwidth/4.0f;
+ exclind.y  = floor( exclind.y + 0.25f );
+
+/*
+outforce1   = float3( linind, exclind.y, jstart );
+outforce2   = float3( linind, exclind.y, jend );
+outforce3   = float3( linind, exclind.y, strwidth );
+outforce4   = float3( linind, exclind.y, exclind.y );
+*/
+
+ jatom.y    = jstart;
+ while ( jatom.y < jend && breakflag > 0.0f ) {
+  jatom.x = 0;
+  while ( jatom.x < strwidth && breakflag > 0.0f ) {
+
+   linind      = (jatom.x + jatom.y*strwidth)/4.0f;
+   linind      = floor( linind + 0.25f );
+
+   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
+   jstrindex.x = linind - jstrindex.y*jstrwidth;
+   jstrindex.x = floor( jstrindex.x + 0.25f );
+
+   // --------------------------------------------------------------------- 
+
+   jsig        = sigma[ jstrindex ];
+   jeps        = epsilon[ jstrindex ];
+
+   jpos        = posq[ jatom ];
+   jpos1       = jpos.xyz;
+   qj.x        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+
+   jpos        = posq[ jatom ];
+   jpos2       = jpos.xyz;
+   qj.y        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+
+   jpos        = posq[ jatom ];
+   jpos3       = jpos.xyz;
+   qj.z        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+
+   jpos        = posq[ jatom ];
+   jpos4       = jpos.xyz;
+   qj.w        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+
+   // --------------------------------------------------------------------- 
+
+   sig         = isig1 + jsig;
+   eps         = ieps1 * jeps;
+   qq          = qi1*qj;
+
+   d1          = ipos1 - jpos1;
+   d2          = ipos1 - jpos2;
+   d3          = ipos1 - jpos3;
+   d4          = ipos1 - jpos4;
+
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   //    outforce1  += fs.x * d1;
+   //    outforce1  += fs.y * d2;
+   //    outforce1  += fs.z * d3;
+   //    outforce1  += fs.w * d4;
+   // outforce1  += float3( 4.0, 4.0, 4.0);
+   // outforce1  += 1.0/r2;
+
+    exclmask    = exclmask > frac4 ? one4 : zero4;
+    sum         = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
+/*
+    if( sum > 0.25f ){
+        outforce1.x  += 1.0f;
+        outforce1.y  += exclusions;
+        outforce1.z  += sum;
+        // outforce1.y  += exclind.x + strwidth*exclind.y;
+        //outforce1.y  += 4.0;
+    }
+*/
+   if( exclind.y < 5.9f && exclind.y > 4.1f ){
+       outforce1.y  += exclind.x + 576.0f*exclind.y;
+       outforce1.z  += exclusions;
+       outforce1.x  += exclind.y;
+    }    
+
+   // --------------------------------------------------------------------- 
+
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig2 + jsig;
+   eps         = ieps2 * jeps;
+   qq          = qi2 * qj;
+
+   d1          = ipos2 - jpos1;
+   d2          = ipos2 - jpos2;
+   d3          = ipos2 - jpos3;
+   d4          = ipos2 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   // outforce2  += fs.x * d1;
+   // outforce2  += fs.y * d2;
+   // outforce2  += fs.z * d3;
+   // outforce2  += fs.w * d4;
+   // outforce2  += float3( 4.0f, 4.0f, 4.0f);
+    exclmask    = exclmask > frac4 ? one4 : zero4;
+    sum         = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
+    if( sum > 0.25f ){
+        outforce2.x  += 1.0f;
+        outforce2.y  += exclusions;
+        outforce2.z  += sum;
+    }
+
+   // --------------------------------------------------------------------- 
+
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig3 + jsig;
+   eps         = ieps3 * jeps;
+   qq          = qi3   * qj;
+
+   d1          = ipos3 - jpos1;
+   d2          = ipos3 - jpos2;
+   d3          = ipos3 - jpos3;
+   d4          = ipos3 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   // outforce3  += fs.x * d1;
+   // outforce3  += fs.y * d2;
+   // outforce3  += fs.z * d3;
+   // outforce3  += fs.w * d4;
+   // outforce3  += float3( 4.0f, 4.0f, 4.0f);
+   // outforce3  += 1.0f/r2;
+    exclmask    = exclmask > frac4 ? one4 : zero4;
+    sum         = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
+    if( sum > 0.25f ){
+        outforce3.x  += 1.0f;
+        outforce3.y  += exclusions;
+        outforce3.z  += sum;
+        // outforce3.y  += exclind.x + strwidth*exclind.y;
+        //outforce3.y  += 4.0;
+    }
+
+   // --------------------------------------------------------------------- 
+
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+
+   sig         = isig4 + jsig;
+   eps         = ieps4 * jeps;
+   qq          = qi4   * qj;
+
+   d1          = ipos4 - jpos1;
+   d2          = ipos4 - jpos2;
+   d3          = ipos4 - jpos3;
+   d4          = ipos4 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   // outforce4  += fs.x * d1;
+   // outforce4  += fs.y * d2;
+   // outforce4  += fs.z * d3;
+   // outforce4  += fs.w * d4;
+   // outforce4  += float3( 4.0f, 4.0f, 4.0f);
+   // outforce4  += 1.0f/r2;
+
+    exclmask    = exclmask > frac4 ? one4 : zero4;
+    sum         = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
+    if( sum > 0.25f ){
+        outforce4.x  += 1.0f;
+        outforce4.y  += exclusions;
+        outforce4.z  += sum;
+        // outforce4.y  += exclind.x + strwidth*exclind.y;
+        //outforce4.y  += 4.0;
+    }
+
+   // --------------------------------------------------------------------- 
+
+   exclind.x   = floor( exclind.x - 2.9f );
+   exclind.y   = floor( exclind.y + 1.2f );
+
+   if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
+    breakflag = -1.0f;
+  }
+
+  jatom.y  = floor( jatom.y + 1.2f );
+ }
+
+}
+
+kernel void kbonded_CDLJ (
+  float xstrwidth,
+  float4 posq[][],
+  float4 atoms<>,
+  float4 parm0<>,
+  float4 parm1<>,
+  float4 parm2<>,
+  float4 parm3<>,
+  float4 parm4<>,
+  out float3 fi<>,
+  out float3 fj<>,
+  out float3 fk<>,
+  out float3 fl<>
+  )
+{
+ float3 rij, rkj, rkl, ril;
+ float2 ai, aj, ak, al;
+ float3 m, n;
+ float sgnphi;
+ float cosfac;
+ float ddphi, ddphi_rb;
+ float3 u, v, s;
+ float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
+ float3 uij, ukj, ukl;
+ float phi, mdphi;
+ float rij_d_ukj, ukj_d_rkl;
+ float normfac;
+ float qq;
+ float3 fi_ang, fj_ang, fk_ang;
+ float3 fi_bond;
+ float cii, ckk, cik;
+ float fs, st, sth;
+ float3 fi_pair;
+ float r2 ;
+ float torsion_numerator;
+ float theta1, costheta1, invsintheta1;
+ float theta2, costheta2, invsintheta2;
+ float4 posqi, posqj, posqk, posql;
+ float sig, eps;
+
+ ai.y = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
+ ai.x = atoms.x - ai.y * xstrwidth;
+ posqi = posq[ ai ];
+
+ if ( atoms.y > -0.5f ) {
+
+      aj.y = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
+  aj.x = atoms.y - aj.y * xstrwidth;
+  posqj = posq[ aj ];
+ }
+ else
+  posqj = float4( 0.0f, 0.0f, 1.0f, 0.0f );
+
+ if ( atoms.z > -0.5f ) {
+
+      ak.y = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
+  ak.x = atoms.z - ak.y * xstrwidth;
+  posqk = posq[ ak ];
+ }
+ else
+  posqk = float4( 0.0f, 1.0f, 1.0f, 0.0f );
+
+ if ( atoms.w > -0.5f ) {
+
+      al.y = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
+  al.x = atoms.w - al.y * xstrwidth;
+  posql = posq[ al ];
+ }
+ else
+  posql = float4( 1.0f, 1.0f, 1.0f, 0.0f );
+
+ qq = posqi.w * posql.w;
+ rij = posqi.xyz - posqj.xyz;
+ rkj = posqk.xyz - posqj.xyz;
+ rkl = posqk.xyz - posql.xyz;
+ ril = posqi.xyz - posql.xyz;
+
+
+
+ invlij = rsqrt( dot( rij, rij ) );
+ invlkj = rsqrt( dot( rkj, rkj ) );
+ invlkl = rsqrt( dot( rkl, rkl ) );
+
+ uij = rij * invlij;
+ ukj = rkj * invlkj;
+ ukl = rkl * invlkl;
+
+
+ rij_d_ukj = dot( rij, ukj );
+ ukj_d_rkl = dot( ukj, rkl );
+
+ m = cross( uij, ukj );
+ n = cross( ukj, ukl );
+
+
+ costheta1 = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
+ theta1 = acos( costheta1 );
+ invsintheta1 = rsqrt( 1.0f - costheta1 * costheta1 );
+
+
+ costheta2 = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
+ theta2 = acos( costheta2 );
+ invsintheta2 = rsqrt( 1.0f - costheta2 * costheta2 );
+
+ cos_phi = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
+
+ sgnphi = sign( dot(rij, n) );
+
+
+ phi = sgnphi * acos( cos_phi );
+ mdphi = parm1.w * phi - parm1.z;
+ ddphi = -parm1.y * parm1.w * sin( mdphi );
+
+
+ cos_phi = -cos_phi;
+ sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) );
+
+ ddphi_rb = 5.0f * parm1.x;
+ ddphi_rb = 4.0f * parm0.w + ddphi_rb * cos_phi;
+ ddphi_rb = 3.0f * parm0.z + ddphi_rb * cos_phi;
+ ddphi_rb = 2.0f * parm0.y + ddphi_rb * cos_phi;
+ ddphi_rb = parm0.x + ddphi_rb * cos_phi;
+ ddphi_rb = -ddphi_rb * sin_phi;
+
+
+ ddphi += ddphi_rb;
+
+ fi = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
+ fl = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
+
+
+ u = rij_d_ukj * invlkj * fi;
+ v = ukj_d_rkl * invlkj * fl;
+ s = u - v;
+
+ fj = s - fi;
+ fk = -(s + fl);
+
+
+
+
+
+ fs = -parm2.y * ( theta1 - parm2.x );
+ st = fs * invsintheta1;
+ st = clamp( st, -100000.0f, 100000.0f );
+
+ sth = st * costheta1;
+
+ cik = st * invlkj * invlij;
+ cii = sth * invlij;
+ ckk = sth * invlkj;
+
+ fi_ang = -( cik * rkj - cii * uij );
+ fk_ang = -( cik * rij - ckk * ukj );
+ fj_ang = -fi_ang - fk_ang;
+
+ fi += fi_ang;
+ fj += fj_ang;
+ fk += fk_ang;
+ fs = -parm2.w * ( theta2 - parm2.z );
+ st = fs * invsintheta2;
+ st = clamp( st, -100000.0f, 100000.0f );
+
+ sth = st * costheta2;
+
+ cik = st * invlkj * invlkl;
+ cii = sth * invlkj;
+ ckk = sth * invlkl;
+
+ fi_ang = ( cik * rkl - cii * ukj );
+ fk_ang = ( cik * rkj - ckk * ukl );
+ fj_ang = -fi_ang - fk_ang;
+
+ fj += fi_ang;
+ fk += fj_ang;
+ fl += fk_ang;
+
+
+ fi_bond = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
+ fi += fi_bond;
+ fj += -fi_bond;
+
+
+ fi_bond = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
+ fj += fi_bond;
+ fk += -fi_bond;
+
+
+ fi_bond = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
+ fk += fi_bond;
+ fl += -fi_bond;
+
+ if ( parm4.z > -0.5f ) {
+  r2 = dot( ril, ril );
+  fs = scalar_force_single_CDLJ( qq, epsfac, parm4.z, parm4.w, r2 );
+  fi_pair = fs * ril;
+  fi += fi_pair;
+  fl -= fi_pair;
+ }
+
+}
+
+kernel void kbonded_CDLJDebug (
+  float epsfac,
+  float xstrwidth,
+  float4 nbparams,
+  float4 posq[][],
+  float4 atoms<>,
+  float4 parm0<>,
+  float4 parm1<>,
+  float4 parm2<>,
+  float4 parm3<>,
+  float4 parm4<>,
+  out float3 fi<>,
+  out float3 fj<>,
+  out float3 fk<>,
+  out float3 fl<>
+  )
+{
+ float3 rij, rkj, rkl, ril;
+ float2 ai, aj, ak, al;
+ float3 m, n;
+ float sgnphi;
+ float cosfac;
+ float ddphi, ddphi_rb;
+ float3 u, v, s;
+ float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
+ float3 uij, ukj, ukl;
+ float phi, mdphi;
+ float rij_d_ukj, ukj_d_rkl;
+ float normfac;
+ float qq;
+ float3 fi_ang, fj_ang, fk_ang;
+ float3 fi_bond;
+ float cii, ckk, cik;
+ float fs, st, sth;
+ float3 fi_pair;
+ float r2 ;
+ float torsion_numerator;
+ float theta1, costheta1, invsintheta1;
+ float theta2, costheta2, invsintheta2;
+ float4 posqi, posqj, posqk, posql;
+ float sig, eps;
+
+ ai.y = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
+ ai.x = atoms.x - ai.y * xstrwidth;
+ posqi = posq[ ai ];
+
+ if ( atoms.y > -0.5f ) {
+  aj.y  = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
+  aj.x  = atoms.y - aj.y * xstrwidth;
+  posqj = posq[ aj ];
+ } else {
+  posqj = float4( 0.0f, 0.0f, 1.0f, 0.0f );
+ }
+
+ if ( atoms.z > -0.5f ) {
+  ak.y  = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
+  ak.x  = atoms.z - ak.y * xstrwidth;
+  posqk = posq[ ak ];
+ } else {
+  posqk = float4( 0.0f, 1.0f, 1.0f, 0.0f );
+ }
+
+ if ( atoms.w > -0.5f ) {
+  al.y = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
+  al.x = atoms.w - al.y * xstrwidth;
+  posql = posq[ al ];
+ } else {
+  posql = float4( 1.0f, 1.0f, 1.0f, 0.0f );
+ }
+
+ qq  = posqi.w * posql.w;
+ rij = posqi.xyz - posqj.xyz;
+ rkj = posqk.xyz - posqj.xyz;
+ rkl = posqk.xyz - posql.xyz;
+ ril = posqi.xyz - posql.xyz;
+
+ invlij = rsqrt( dot( rij, rij ) );
+ invlkj = rsqrt( dot( rkj, rkj ) );
+ invlkl = rsqrt( dot( rkl, rkl ) );
+
+ uij = rij * invlij;
+ ukj = rkj * invlkj;
+ ukl = rkl * invlkl;
+
+
+ rij_d_ukj = dot( rij, ukj );
+ ukj_d_rkl = dot( ukj, rkl );
+
+ m = cross( uij, ukj );
+ n = cross( ukj, ukl );
+
+
+ costheta1 = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
+ theta1 = acos( costheta1 );
+ invsintheta1 = rsqrt( 1.0f - costheta1 * costheta1 );
+
+
+ costheta2 = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
+ theta2 = acos( costheta2 );
+ invsintheta2 = rsqrt( 1.0f - costheta2 * costheta2 );
+
+ cos_phi = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
+
+ sgnphi = sign( dot(rij, n) );
+
+ phi = sgnphi * acos( cos_phi );
+ mdphi = parm1.w * phi - parm1.z;
+ ddphi = -parm1.y * parm1.w * sin( mdphi );
+
+
+ cos_phi = -cos_phi;
+ sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) );
+
+ ddphi_rb = 5.0f * parm1.x;
+ ddphi_rb = 4.0f * parm0.w + ddphi_rb * cos_phi;
+ ddphi_rb = 3.0f * parm0.z + ddphi_rb * cos_phi;
+ ddphi_rb = 2.0f * parm0.y + ddphi_rb * cos_phi;
+ ddphi_rb = parm0.x + ddphi_rb * cos_phi;
+ ddphi_rb = -ddphi_rb * sin_phi;
+
+ ddphi += ddphi_rb;
+ //ddphi = ddphi_rb;
+ //ddphi = 0.0f;
+
+ fi = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
+ fl = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
+
+ u = rij_d_ukj * invlkj * fi;
+ v = ukj_d_rkl * invlkj * fl;
+ s = u - v;
+ fj = s - fi;
+ fk = -(s + fl);
+
+ fs = -parm2.y * ( theta1 - parm2.x );
+ st = fs * invsintheta1;
+ st = clamp( st, -100000.0f, 100000.0f );
+
+ sth = st * costheta1;
+
+ cik = st * invlkj * invlij;
+ cii = sth * invlij;
+ ckk = sth * invlkj;
+
+ fi_ang = -( cik * rkj - cii * uij );
+ fk_ang = -( cik * rij - ckk * ukj );
+ fj_ang = -fi_ang - fk_ang;
+
+/*
+ fi += fi_ang;
+ fj += fj_ang;
+ fk += fk_ang;
+*/
+
+ fs = -parm2.w * ( theta2 - parm2.z );
+ st = fs * invsintheta2;
+ st = clamp( st, -100000.0f, 100000.0f );
+
+ sth = st * costheta2;
+
+ cik = st * invlkj * invlkl;
+ cii = sth * invlkj;
+ ckk = sth * invlkl;
+
+ fi_ang = ( cik * rkl - cii * ukj );
+ fk_ang = ( cik * rkj - ckk * ukl );
+ fj_ang = -fi_ang - fk_ang;
+
+/*
+ fj += fi_ang;
+ fk += fj_ang;
+ fl += fk_ang;
+*/
+
+ // bonded terms
+ // k*(r - d)*deltaX/r = k*(1 - d/r)*deltaX
+
+/*
+ fi = float3( 0.0f, 0.0f, 0.0f );
+ fj = float3( 0.0f, 0.0f, 0.0f );
+ fk = float3( 0.0f, 0.0f, 0.0f );
+ fl = float3( 0.0f, 0.0f, 0.0f );
+*/
+/*
+// ------------------------------------------------------------
+
+ fi_bond = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
+ fi += fi_bond;
+ fj += -fi_bond;
+
+// ------------------------------------------------------------
+
+ fi_bond = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
+ fj += fi_bond;
+ fk += -fi_bond;
+
+// ------------------------------------------------------------
+
+ fi_bond = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
+ fk += fi_bond;
+ fl += -fi_bond;
+*/
+
+
+// ------------------------------------------------------------
+
+// LJ14 terms
+
+//fi   = float3( 0.0f, 0.0f, 0.0f);
+//fj   = float3( 0.0f, 0.0f, 0.0f);
+//fk   = float3( 0.0f, 0.0f, 0.0f);
+//fl   = float3( 0.0f, 0.0f, 0.0f);
+/*
+ if ( parm4.z > -0.5f ) {
+  r2 = dot( ril, ril );
+  fs = scalar_force_single_CDLJ( qq, epsfac, parm4.z, parm4.w, r2 );
+  fi_pair = fs * ril;
+//fi_pair = float3( posqi.w + posql.w, posqi.w, posql.w );
+//fi_pair = float3( posqi.w + posql.w, posqi.w, posql.w );
+//fi_pair = float3( atoms.x, parm4.z, parm4.w );
+//fi_pair = float3( 0.0f, 0.0f, 0.0f);
+  fi  += fi_pair;
+  fl  -= fi_pair;
+  //fi   = fi_pair;
+  //fl   =  float3( atoms.w, parm4.z, parm4.w );
+ }
+*/
+
+}
+
+kernel void knbforce_CDLJ4NoEx(
+  float natoms,
+  float nAtomsCeiling,
+  float dupfac,
+  float strheight,
+  float strwidth,
+  float jstrwidth,
+  float fstrwidth,
+  float epsfac,
+  float4 params,
+  float4 posq[][],
+  float4 isigeps[][],
+  float4 sigma[][],
+  float4 epsilon[][],
+  float excl[][],
+  out float3 outforce1<>,
+  out float3 outforce2<>,
+  out float3 outforce3<>,
+  out float3 outforce4<> ) {
+
+ float4 jsig, jeps;
+ float3 ipos1, ipos2, ipos3, ipos4;
+ float4 jpos;
+ float3 jpos1, jpos2, jpos3, jpos4;
+ float qi1, qi2, qi3, qi4;
+
+ float isig1, isig2, isig3, isig4;
+ float ieps1, ieps2, ieps3, ieps4;
+
+ float4 qj, qq;
+ float3 d1, d2, d3, d4;
+ float2 exclind;
+ float4 sig, eps;
+ float  exclusions;
+ float4 r2;
+ float3 pad;
+ float4 exclmask;
+ float4 exclconst;
+ float2 iatom;
+ float2 jstrindex;
+ float a_iatom;
+ float linind;
+
+ float breakflag;
+ float2 jatom;
+ float4 fs;
+ float jend, jstart;
+ float which_rep;
+
+ exclconst  = float4( 2.0f, 3.0f, 5.0f, 7.0f );
+
+ outforce1  = float3( 0.0f, 0.0f, 0.0f );
+ outforce2  = float3( 0.0f, 0.0f, 0.0f );
+ outforce3  = float3( 0.0f, 0.0f, 0.0f );
+ outforce4  = float3( 0.0f, 0.0f, 0.0f );
+
+ a_iatom    = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
+
+ // linind     = fmod( a_iatom, natoms );
+ linind     = fmod( a_iatom, nAtomsCeiling );
+
+ iatom.x    = fmod( linind, strwidth );
+ iatom.y    = round( (linind - fmod(linind, strwidth))/strwidth );
+
+/* --------------------------------------------------------------------- */
+
+ jpos       = posq[ iatom ];
+ ipos1      = jpos.xyz;
+ qi1        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig1      = jstrindex.x;
+ ieps1      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1.0f;
+ jpos       = posq[ iatom ];
+ ipos2      = jpos.xyz;
+ qi2        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig2      = jstrindex.x;
+ ieps2      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1.0f;
+ jpos       = posq[ iatom ];
+ ipos3      = jpos.xyz;
+ qi3        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig3      = jstrindex.x;
+ ieps3      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ iatom.x   += 1.0f;
+ jpos       = posq[ iatom ];
+ ipos4      = jpos.xyz;
+ qi4        = jpos.w;
+
+ jstrindex  = isigeps[ iatom ];
+ isig4      = jstrindex.x;
+ ieps4      = jstrindex.y;
+
+/* --------------------------------------------------------------------- */
+
+ breakflag  = 1.0f;
+ jatom.y    = 0.0f;
+
+ // which_rep  = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
+ which_rep  = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
+
+ jend       = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
+ jstart     = which_rep * jend;
+ if ( which_rep > dupfac - 1.5f ) {
+  jend = 1e6f;
+ }
+ jend      += jstart;
+
+ //exclind.x  = linind;
+ //exclind.y  = jstart * strwidth/4.0f;
+ //exclind.y  = floor(  exclind.y  + 0.25f );
+
+ jatom.y    = jstart;
+ while ( jatom.y < jend && breakflag > 0.0f ) {
+  jatom.x = 0;
+  while ( jatom.x < strwidth && breakflag > 0.0f ) {
+
+//  exclusions  = excl[ exclind ];
+//   exclmask    = fmod( exclusions, exclconst );
+
+   linind      = (jatom.x + jatom.y*strwidth)/4.0f;
+   linind      = floor( linind + 0.25f );
+
+   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
+   jstrindex.x = linind - jstrindex.y * jstrwidth;
+   jstrindex.x = floor( jstrindex.x + 0.25f );
+
+   /* --------------------------------------------------------------------- */
+
+   jsig        = sigma[ jstrindex ];
+   jeps        = epsilon[ jstrindex ];
+
+   jpos        = posq[ jatom ];
+   jpos1       = jpos.xyz;
+   qj.x        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos2       = jpos.xyz;
+   qj.y        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos3       = jpos.xyz;
+   qj.z        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   jpos        = posq[ jatom ];
+   jpos4       = jpos.xyz;
+   qj.w        = jpos.w;
+   jatom.x     = floor( jatom.x + 1.2f );
+   //jatom.x    += 1.0f;
+
+   /* --------------------------------------------------------------------- */
+
+   sig         = isig1 + jsig;
+   eps         = ieps1 * jeps;
+   qq          = qi1*qj;
+
+   d1          = ipos1 - jpos1;
+   d2          = ipos1 - jpos2;
+   d3          = ipos1 - jpos3;
+   d4          = ipos1 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 );
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce1  += fs.x * d1;
+   outforce1  += fs.y * d2;
+   outforce1  += fs.z * d3;
+   outforce1  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  += 1.0f;
+/*
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+*/
+
+   sig         = isig2 + jsig;
+   eps         = ieps2 * jeps;
+   qq          = qi2 * qj;
+
+   d1          = ipos2 - jpos1;
+   d2          = ipos2 - jpos2;
+   d3          = ipos2 - jpos3;
+   d4          = ipos2 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 );
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce2  += fs.x * d1;
+   outforce2  += fs.y * d2;
+   outforce2  += fs.z * d3;
+   outforce2  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  += 1.0f;
+/*
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+*/
+
+   sig         = isig3 + jsig;
+   eps         = ieps3 * jeps;
+   qq          = qi3   * qj;
+
+   d1          = ipos3 - jpos1;
+   d2          = ipos3 - jpos2;
+   d3          = ipos3 - jpos3;
+   d4          = ipos3 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 );
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce3  += fs.x * d1;
+   outforce3  += fs.y * d2;
+   outforce3  += fs.z * d3;
+   outforce3  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  += 1.0f;
+/*
+   exclind.x   = floor( exclind.x + 1.2f );
+   exclusions  = excl[ exclind ];
+   exclmask    = fmod( exclusions, exclconst );
+*/
+
+   sig         = isig4 + jsig;
+   eps         = ieps4 * jeps;
+   qq          = qi4   * qj;
+
+   d1          = ipos4 - jpos1;
+   d2          = ipos4 - jpos2;
+   d3          = ipos4 - jpos3;
+   d4          = ipos4 - jpos4;
+
+   r2          = get_r2_CDLJ( d1, d2, d3, d4 );
+   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
+
+   outforce4  += fs.x * d1;
+   outforce4  += fs.y * d2;
+   outforce4  += fs.z * d3;
+   outforce4  += fs.w * d4;
+
+   /* --------------------------------------------------------------------- */
+
+   //exclind.x  -= 3.0f;
+   //exclind.x   = floor( exclind.x - 2.9f );
+
+   //exclind.y  += 1.0f;
+   //exclind.y  = floor(  exclind.y  + 1.25f );
+
+   if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
+  }
+
+  //jatom.y += 1.0f;
+  jatom.y  = floor( jatom.y + 1.2f );
+ }
+
+}
--- a/platforms/brook/src/gpu/kinvmap_gather.br
+++ b/platforms/brook/src/gpu/kinvmap_gather.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+//Gather kernel for use with diferent angle, dihedral and improper functions
+//
+//For small systems, the overhead of calling the kernel is so high
+//That we have to minimize the number of kernel calls. At the same
+//time we don't want to waste reads, so I'm providing a bunch of kernels
+//here that are unrolled to diferent extents. Use the appropriate one
+//depending on how many inverse maps there are, rather than looping over
+//the simple one.
+
+
+//helper function to make the unrolling look better
+kernel float3 do_gather( float strwidth, float4 invmap<>, float3 forces[][] ) {
+	float3 f;
+	float4 quotient, remainder;
+	float2 idx;
+
+	f = float3( 0.0f, 0.0f, 0.0f );
+	
+	//Convert from linear to 2D index
+	// quotient  = floor( invmap / strwidth );
+	quotient  = round( ( invmap - fmod(invmap, strwidth))/strwidth ); 
+	remainder = invmap - quotient * strwidth;
+	
+	//Add each force only if non-negative
+	if ( invmap.x >= 0.0f ) {
+		idx.y = quotient.x;
+		idx.x = remainder.x;
+		f = forces[ idx ];
+	}
+
+	if ( invmap.y >= 0.0f ) {
+		idx.y = quotient.y;
+		idx.x = remainder.y;
+		f += forces[ idx ];
+	}
+	
+	if ( invmap.z >= 0.0f ) {
+		idx.y = quotient.z;
+		idx.x = remainder.z;
+		f += forces[ idx ];
+	}
+
+	if ( invmap.w >= 0.0f ) {
+		idx.y = quotient.w;
+		idx.x = remainder.w;
+		f += forces[ idx ];
+	}
+
+	return f;
+}
+
+//Simple version, takes only one index stream
+kernel void kinvmap_gather(
+		float strwidth,    //stream width of the dihedral forces
+		float4 invmap<>,   //indices into the dihedral forces
+		float3 forces[][], //dihedral forces
+		float3 inforce<>,  //particle forces before
+		out float3 outforce<> //particle forces after
+		)
+{
+	outforce = inforce;
+
+	outforce += do_gather( strwidth, invmap, forces );
+}
+
+//Takes two inverse maps
+kernel void kinvmap_gather2(
+		float strwidth,    //stream width of the dihedral forces
+		float4 invmap1<>,   //indices into the dihedral forces
+		float4 invmap2<>,   //indices into the dihedral forces
+		float3 forces[][], //dihedral forces
+		float3 inforce<>,  //particle forces before
+		out float3 outforce<> //particle forces after
+		)
+{
+	outforce = inforce;
+
+	outforce += do_gather( strwidth, invmap1, forces );
+	outforce += do_gather( strwidth, invmap2, forces );
+}
+
+//Takes three inverse maps
+kernel void kinvmap_gather3(
+		float strwidth,    //stream width of the dihedral forces
+		float4 invmap1<>,   //indices into the dihedral forces
+		float4 invmap2<>,   //indices into the dihedral forces
+		float4 invmap3<>,
+		float3 forces[][], //dihedral forces
+		float3 inforce<>,  //particle forces before
+		out float3 outforce<> //particle forces after
+		)
+{
+	outforce = inforce;
+
+	outforce += do_gather( strwidth, invmap1, forces );
+	outforce += do_gather( strwidth, invmap2, forces );
+	outforce += do_gather( strwidth, invmap3, forces );
+}
+
+//Takes four inverse maps
+kernel void kinvmap_gather4(
+		float strwidth,    //stream width of the dihedral forces
+		float4 invmap1<>,   //indices into the dihedral forces
+		float4 invmap2<>,   //indices into the dihedral forces
+		float4 invmap3<>,
+		float4 invmap4<>,
+		float3 forces[][], //dihedral forces
+		float3 inforce<>,  //particle forces before
+		out float3 outforce<> //particle forces after
+		)
+{
+	outforce = inforce;
+
+	outforce += do_gather( strwidth, invmap1, forces );
+	outforce += do_gather( strwidth, invmap2, forces );
+	outforce += do_gather( strwidth, invmap3, forces );
+	outforce += do_gather( strwidth, invmap4, forces );
+}
+
+//Takes five inverse maps
+kernel void kinvmap_gather5(
+		float strwidth,    //stream width of the dihedral forces
+		float4 invmap1<>,   //indices into the dihedral forces
+		float4 invmap2<>,   //indices into the dihedral forces
+		float4 invmap3<>,
+		float4 invmap4<>,
+		float4 invmap5<>,
+		float3 forces[][], //dihedral forces
+		float3 inforce<>,  //particle forces before
+		out float3 outforce<> //particle forces after
+		)
+{
+	outforce = inforce;
+
+	outforce += do_gather( strwidth, invmap1, forces );
+	outforce += do_gather( strwidth, invmap2, forces );
+	outforce += do_gather( strwidth, invmap3, forces );
+	outforce += do_gather( strwidth, invmap4, forces );
+	outforce += do_gather( strwidth, invmap5, forces );
+}
+
+//Takes six inverse maps - this is the last one!
+kernel void kinvmap_gather6(
+		float strwidth,    //stream width of the dihedral forces
+		float4 invmap1<>,   //indices into the dihedral forces
+		float4 invmap2<>,   //indices into the dihedral forces
+		float4 invmap3<>,
+		float4 invmap4<>,
+		float4 invmap5<>,
+		float4 invmap6<>,
+		float3 forces[][], //dihedral forces
+		float3 inforce<>,  //particle forces before
+		out float3 outforce<> //particle forces after
+		)
+{
+	outforce = inforce;
+
+	outforce += do_gather( strwidth, invmap1, forces );
+	outforce += do_gather( strwidth, invmap2, forces );
+	outforce += do_gather( strwidth, invmap3, forces );
+	outforce += do_gather( strwidth, invmap4, forces );
+	outforce += do_gather( strwidth, invmap5, forces );
+	outforce += do_gather( strwidth, invmap6, forces );
+}
+
+
+
+kernel float3 etch_force( 
+		float fpos, 
+		float strwidth,
+		float3 fi[][], 
+		float3 fj[][],
+		float3 fk[][],
+		float3 fl[][]
+		)
+{
+	float2 ind;
+	float _fpos;
+
+	_fpos = fpos;
+	
+	if ( _fpos > 300000.0f ) {
+		_fpos = _fpos - 300000.0f;
+		//ind.y = floor( _fpos / strwidth );
+      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
+		ind.x = _fpos - ind.y * strwidth;
+
+		return fl[ ind ];
+	}
+	else if ( _fpos > 200000.0f ) {
+		_fpos = _fpos - 200000.0f;
+		//ind.y = floor( _fpos / strwidth );
+      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
+		ind.x = _fpos - ind.y * strwidth;
+
+		return fk[ ind ];
+	}
+	else if ( _fpos > 100000.0f ) {
+		_fpos = _fpos - 100000.0f;
+		//ind.y = floor( _fpos / strwidth );
+      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
+		ind.x = _fpos - ind.y * strwidth;
+
+		return fj[ ind ];
+	}
+	else if ( _fpos >= -0.5f ) {
+		//ind.y = floor( _fpos / strwidth );
+      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
+		ind.x = _fpos - ind.y * strwidth;
+
+		return fi[ ind ];
+	}
+	else 
+		return 0.0f;
+	
+}
+
+
+//For-loop version doesn't work
+//Using a merged version of the above
+
+kernel float2 linear_to_2D( float linind, float width )
+{
+	float2 ind;
+	//ind.y = floor( linind / width );
+   ind.y = round( ( linind - fmod( linind, width))/width );
+	ind.x = linind - ind.y * width;
+
+	return ind;
+}
+
+//helper function to make the unrolling look better
+kernel float3 do_gather_merged_single( float strwidth, float invmap, 
+		float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][] ) {
+	
+	float3 f;
+	float2 idx;
+	float _invmap;
+	float n;
+
+	_invmap = invmap;
+
+	n = floor( _invmap / 100000.0f );
+	_invmap -= n * 100000.0f;
+	idx = linear_to_2D( _invmap, strwidth );
+
+	if ( n > 2.5f ) { 
+		f = fl[ idx ];
+	}
+	else if ( n > 1.5f ) {
+		f = fk[ idx ];
+	}
+	else if ( n > 0.5f ) {
+		f = fj[ idx ];
+	}
+	else if( n > -0.5f ) {
+		f = fi[ idx ];
+	}
+
+	return f;
+}
+
+kernel float3 do_gather_merged( float strwidth, float4 invmap,
+		float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][])
+{
+	float3 f;
+	f = do_gather_merged_single( strwidth, invmap.x, fi, fj, fk, fl )
+	    + do_gather_merged_single( strwidth, invmap.y, fi, fj, fk, fl )
+	    + do_gather_merged_single( strwidth, invmap.z, fi, fj, fk, fl )
+	    + do_gather_merged_single( strwidth, invmap.w, fi, fj, fk, fl );
+
+	return f;
+}
+
+kernel void kinvmap_gather_merged5(
+		float natoms,      //number of atoms
+		float strwidth,    //stream width of out-of-order forces
+		float4 invmap0<>, 
+		float4 invmap1<>, 
+		float4 invmap2<>, 
+		float4 invmap3<>, 
+		float4 invmap4<>, 
+		float3 fi[][],     //i-forces
+		float3 fj[][],     //j-forces
+		float3 fk[][],     //k-forces
+		float3 fl[][],     //l-forces
+		float3 inforce<>,  
+		out float3 outforce<>
+		)
+{
+	outforce = inforce;
+	outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap4, fi, fj, fk, fl );
+}
+
+kernel void kinvmap_gather_merged9(
+		float natoms,      //number of atoms
+		float strwidth,    //stream width of out-of-order forces
+		float4 invmap0<>, 
+		float4 invmap1<>, 
+		float4 invmap2<>, 
+		float4 invmap3<>, 
+		float4 invmap4<>, 
+		float4 invmap5<>, 
+		float4 invmap6<>, 
+		float4 invmap7<>, 
+		float4 invmap8<>, 
+		float3 fi[][],     //i-forces
+		float3 fj[][],     //j-forces
+		float3 fk[][],     //k-forces
+		float3 fl[][],     //l-forces
+		float3 inforce<>,  
+		out float3 outforce<>
+		)
+{
+	outforce = inforce;
+	outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap4, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap5, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap6, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap7, fi, fj, fk, fl )
+	            + do_gather_merged( strwidth, invmap8, fi, fj, fk, fl );
+}
--- a/platforms/brook/src/gpu/kinvmap_gather.h
+++ b/platforms/brook/src/gpu/kinvmap_gather.h
+void  kinvmap_gather (const float  strwidth,
+		::brook::stream invmap,
+		::brook::stream forces,
+		::brook::stream inforce,
+		::brook::stream outforce); 
+
+void  kinvmap_gather2 (const float  strwidth,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream forces,
+		::brook::stream inforce,
+		::brook::stream outforce); 
+
+void  kinvmap_gather3 (const float  strwidth,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream invmap3,
+		::brook::stream forces,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+void  kinvmap_gather4 (const float  strwidth,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream invmap3,
+		::brook::stream invmap4,
+		::brook::stream forces,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+void  kinvmap_gather5 (const float  strwidth,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream invmap3,
+		::brook::stream invmap4,
+		::brook::stream invmap5,
+		::brook::stream forces,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+void  kinvmap_gather6 (const float  strwidth,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream invmap3,
+		::brook::stream invmap4,
+		::brook::stream invmap5,
+		::brook::stream invmap6,
+		::brook::stream forces,
+		::brook::stream inforce,
+		::brook::stream outforce); 
+
+
+void  kinvmap_gather3_4 (const float  strwidth,
+		::brook::stream invmap3_1,
+		::brook::stream invmap3_2,
+		::brook::stream invmap3_3,
+		::brook::stream forces3,
+		::brook::stream invmap4_1,
+		::brook::stream invmap4_2,
+		::brook::stream invmap4_3,
+		::brook::stream invmap4_4,
+		::brook::stream forces4,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+void  kinvmap_gather3_5 (const float  strwidth,
+		::brook::stream invmap3_1,
+		::brook::stream invmap3_2,
+		::brook::stream invmap3_3,
+		::brook::stream forces3,
+		::brook::stream invmap5_1,
+		::brook::stream invmap5_2,
+		::brook::stream invmap5_3,
+		::brook::stream invmap5_4,
+		::brook::stream invmap5_5,
+		::brook::stream forces5,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+
+void  kinvmap_gather5_2 (const float  strwidth,
+		::brook::stream invmap5_1,
+		::brook::stream invmap5_2,
+		::brook::stream invmap5_3,
+		::brook::stream invmap5_4,
+		::brook::stream invmap5_5,
+		::brook::stream forces5,
+		::brook::stream invmap2_1,
+		::brook::stream invmap2_2,
+		::brook::stream forces2,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+
+void  kinvmap_gather4_2 (const float  strwidth,
+		::brook::stream invmap4_1,
+		::brook::stream invmap4_2,
+		::brook::stream invmap4_3,
+		::brook::stream invmap4_4,
+		::brook::stream forces4,
+		::brook::stream invmap2_1,
+		::brook::stream invmap2_2,
+		::brook::stream forces2,
+		::brook::stream inforce,
+		::brook::stream outforce);
+
+
+void  kinvmap_gather_merged (const float  natoms,
+		const float  strwidth,
+		const float  istrwidth,
+		const float  fstrwidth,
+		::brook::stream nimap,
+		::brook::stream invmap,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl,
+		::brook::stream inforce,
+		::brook::stream outforce); 
+
+void  kinvmap_gather_merged9 (const float  natoms,
+		const float  strwidth,
+		::brook::stream invmap0,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream invmap3,
+		::brook::stream invmap4,
+		::brook::stream invmap5,
+		::brook::stream invmap6,
+		::brook::stream invmap7,
+		::brook::stream invmap8,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl,
+		::brook::stream inforce,
+		::brook::stream outforce); 
+
+void  kinvmap_gather_merged5 (const float  natoms,
+		const float  strwidth,
+		::brook::stream invmap0,
+		::brook::stream invmap1,
+		::brook::stream invmap2,
+		::brook::stream invmap3,
+		::brook::stream invmap4,
+		::brook::stream fi,
+		::brook::stream fj,
+		::brook::stream fk,
+		::brook::stream fl,
+		::brook::stream inforce,
+		::brook::stream outforce); 
--- a/platforms/brook/src/gpu/kmerge_partial_forces.br
+++ b/platforms/brook/src/gpu/kmerge_partial_forces.br
+/* After forces above, we have the forces for even numbered particles
+ * in one stream, odd numbered particles in another.
+ * In each stream, the forces are in several parts depending on how
+ * many times we replicated the input stream.
+ *
+ * This kernel should not be a bottle neck, but if it turns out to
+ * be so, we can try some work arounds.
+ *
+ * To avoid an extra kernel to zero forces, this sets the forces
+ * rather than adding to it.
+ * */
+
+kernel static void kmerge_partial_forces( 
+		float repfac, 
+		float atomStrWidth, 
+		float pforceStrWidth,
+		float natoms,
+		float3 pforce1[][], 
+		float3 pforce2[][], 
+		out float3 force<> )
+{
+	float linind;
+	float2 pindex;
+	float odd;
+	float i;
+	
+	float2 adjustcount = (indexof force);
+	
+	//convert to linear atom index
+	linind = adjustcount.x + adjustcount.y * atomStrWidth;
+
+	//If odd or even, we pick from diferent streams.
+	odd = linind - floor( linind / 2.0f ) * 2.0f;
+
+	//Now linear index is the index into partial_forces
+	linind = floor( linind / 2.0f );
+	force = float3( 0.0f, 0.0f, 0.0f );
+
+	//If we have predicated conditionals, we should 
+	//keep the conditional inside the loop
+	for ( i = 0; i < repfac; i+=1.0f ) {
+		pindex.y = floor( linind / pforceStrWidth );		
+		pindex.x = linind - pindex.y * pforceStrWidth;
+		//pindex.x = fmod( linind, pforceStrWidth );
+		
+		if ( odd > 0.5f ) { //is odd
+			force += pforce2[ pindex ];
+		}
+		else {
+			force += pforce1[ pindex ];
+		}
+		linind += natoms/2.0f; 
+	}
+}
+
+kernel void kMergeFloat3_4( 
+      float repfac, 
+      float atomStreamWidth, 
+      float pStreamWidth,
+      float natoms,
+      float roundNatoms,
+      float iUnroll,
+      float3 pstream1[][], 
+      float3 pstream2[][], 
+      float3 pstream3[][], 
+      float3 pstream4[][], 
+      out float3 outstream<> )
+{
+   float atomIndex, forceIndex, qIndex, qOff;
+   float2 pindex;
+   float i;
+   
+   // given atom index find force indices and streams
+
+   pindex      = indexof( outstream );
+   atomIndex   = pindex.x + pindex.y*atomStreamWidth;
+   forceIndex  = atomIndex;
+
+   outstream   = float3( 0.0f, 0.0f, 0.0f );
+   
+   for( i = 0.0f; i < repfac; i += 1.0f ){
+
+      qIndex            = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
+      qOff              = forceIndex - iUnroll*qIndex;
+      pindex.y          = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
+      pindex.x          = qIndex - pindex.y*pStreamWidth;
+
+      if ( qOff < 0.5f ){ 
+         outstream += pstream1[ pindex ];
+      } else if( qOff < 1.5f ){
+         outstream += pstream2[ pindex ];
+      } else if( qOff < 2.5f ){
+         outstream += pstream3[ pindex ];
+      } else {
+         outstream += pstream4[ pindex ];
+      }
+
+      forceIndex += roundNatoms;
+   }
+}
+
--- a/platforms/brook/src/gpu/knlforce.br
+++ b/platforms/brook/src/gpu/knlforce.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+/* 
+ * 3rd attempt at writing a force kernel that uses neighborlists.
+ * Keeping things very simple this time - no unrolling at all.
+ *
+ * Starting with simple Coul-LJ
+ * */
+
+//Constant dielectric, LJ (normal forces)
+kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
+	float4 invr, invrsig2, invrsig6;
+	float4 f;
+
+	invr = rsqrt( r2 ); // 1 or 2 flops?
+	invrsig2 = invr * sig; //1 flop
+	invrsig2 = invrsig2 * invrsig2; //1 flop
+	invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops
+	f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops
+	f += epsfac*qq*invr; //2 flops
+	f *= invr*invr; //2 flops 
+	return f; //Total ? flops
+}
+
+kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) {
+	float4 r2;
+	r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops
+	return r2;
+}
+
+kernel void knbforce_nl(
+		float AtomStrWidth,  //Width of atom position stream
+		float epsfac,
+		iter float2 wpos<>,  //pixel position of output (i-atom)
+		float4 posq[][],     //coordinates and charges
+		float3 inforce<>,
+		float4 nlist0<>,     //nbor indices
+		float4 sig0<>,        //LJ sigmas
+		float4 eps0<>,        //LJ epsilons
+		//We'll add more nlists here to reduce number of kernel calls
+		out float3 force<>   //output forces
+		)
+{
+	float2 jind;
+	float4 ind_tmp1, ind_tmp2;
+	float4 qj;
+	float3 jpos1, jpos2, jpos3, jpos4;
+	float4 r2, fs, qq;
+	float3 d1, d2, d3, d4;
+	float3 ipos;
+	float qi;
+	
+	//Since neighbors will not be adjacent in memory
+	//We may want to stagger compute and etch, but 
+	//with fxc, we don't have control over scheduling anyway.
+	//Will try hand scheduling the ps3 code later
+
+	//etch i parameters
+	ipos = posq[ wpos ].xyz;
+	qi   = posq[ wpos ].w;
+
+	//etch j parameters
+	
+	ind_tmp1 = floor( nlist0 / AtomStrWidth );
+	ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
+
+	force = inforce;
+	
+	//1st set of 4 neighbors
+	jind.y = ind_tmp1.x;
+	jind.x = ind_tmp2.x;
+	jpos1  = posq[ jind ].xyz;
+	qj.x   = posq[ jind ].w;
+	
+	jind.y = ind_tmp1.y;
+	jind.x = ind_tmp2.y;
+	jpos2  = posq[ jind ].xyz;
+	qj.y   = posq[ jind ].w;
+	
+	jind.y = ind_tmp1.z;
+	jind.x = ind_tmp2.z;
+	jpos3  = posq[ jind ].xyz;
+	qj.z   = posq[ jind ].w;
+
+	
+	jind.y = ind_tmp1.w;
+	jind.x = ind_tmp2.w;
+	jpos4  = posq[ jind ].xyz;
+	qj.w   = posq[ jind ].w;
+
+
+	d1 = ipos - jpos1;
+	d2 = ipos - jpos2;
+	d3 = ipos - jpos3;
+	d4 = ipos - jpos4;
+	
+	r2 = nl_get_r2( d1, d2, d3, d4 );
+
+	qq = qi * qj;
+
+	fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 );
+	
+	force += fs.x * d1;
+	force += fs.y * d2;
+	force += fs.z * d3;
+	force += fs.w * d4;	
+}
+
+
--- a/platforms/brook/src/gpu/knlist.br
+++ b/platforms/brook/src/gpu/knlist.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+/* Order N^2 neighbor searching.
+ *
+ * This only works for force fields that don't have charge groups.
+ * If you insist on charge groups, you'll have to pass in appropriate masks here.
+ *
+ * This is a simplified kernel, for testing the O(N) speeds.
+ *
+ * This does a complete N^2 search without considering groups of
+ * atoms. Most likely this will prove to be inefficient for 
+ * the O(N) kernel. Lets find out.
+ * 
+ *
+ * Each component of the curpass textures is an atom index. The w component
+ * of curpass3 is a count indicating how many j particles we have
+ * scanned for this particular i atom.
+ * 
+ * */
+
+kernel void knborsearch(
+		float first,          //Positive means constructing the first 16.
+		iter float2 wpos<>,   //pixel position of output
+		float AtomStrHeight,
+		float AtomStrWidth,
+		float cutoff2,      //square of the cutoff
+		float natoms,       //number of atoms
+		float excl[][],    //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
+		float4 posq[][],    //atom positions/charges
+		float4 prevpass3<>, //Last output texture of previous pass
+		out float4 curpass0<>,  //First output of current pass
+		out float4 curpass1<>,  
+		out float4 curpass2<>,  
+		out float4 curpass3<>  //Last output of current pass, used in next pass
+		){
+	/*For this kernel, wpos == iatom*/
+	float2 iind;
+	float2 jind;
+	float3 ipos, jpos, dr;
+	float r2;
+	float listptr; //Where in the 16-chunk are we now.
+	float jlinind;
+	float breakflag; //positive means keep looping, negative means stop
+	float4 exclconst;
+	float2 exclind;
+	float exclusions;
+
+	exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
+	
+	iind = wpos;
+
+	exclind.x = iind.x + iind.y * AtomStrWidth;
+
+	//etch i atom
+	ipos = posq[ iind ].xyz;
+
+	//Loop over j depending on prevpass
+	jlinind = prevpass3.w + 1;
+	jind.y = floor( jlinind / AtomStrWidth );
+	jind.x = fmod( jlinind, AtomStrWidth );
+	exclind.y = jlinind;
+
+	//All outputs should be initialized to 
+
+	listptr = 0.0f;
+	breakflag = 1.0f;
+
+	//if we already finished, do nothing
+	if ( first < 0.0f && prevpass3.w < 0.0f )
+		breakflag = -1.0f;
+
+	//set to -1 to indicate no neighbor
+	//just to save a separate set of init calls
+	curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
+	curpass1 = curpass0;
+	curpass2 = curpass0;
+	curpass3 = curpass0;
+	
+	while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
+		while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
+
+			//First see if this pair is excluded
+			exclusions = excl[ exclind ];
+
+			if ( exclusions < 0.5f ) {
+				jpos = posq[ jind ].xyz;
+
+				dr = jpos - ipos;
+				r2 = dot( dr, dr );
+
+				//If it is inside the cutoff
+				if ( r2 < cutoff2 ) {
+					//Figure out where to put it
+					//We are allowed 4 nested conditionals
+					//We can play with the structuring of these
+					if ( listptr < 0.5f )
+						curpass0.x = jlinind;
+					else if ( listptr < 1.5f )
+						curpass0.y = jlinind;
+					else if ( listptr < 2.5f )
+						curpass0.z = jlinind;
+					else if ( listptr < 3.5f )
+						curpass0.w = jlinind;
+					else if ( listptr < 4.5f )
+						curpass1.x = jlinind;
+					else if ( listptr < 5.5f )
+						curpass1.y = jlinind;
+					else if ( listptr < 6.5f )
+						curpass1.z = jlinind;
+					else if ( listptr < 7.5f )
+						curpass1.w = jlinind;
+					else if ( listptr < 8.5f )
+						curpass2.x = jlinind;
+					else if ( listptr < 9.5f )
+						curpass2.y = jlinind;
+					else if ( listptr < 10.5f )
+						curpass2.z = jlinind;
+					else if ( listptr < 11.5f )
+						curpass2.w = jlinind;
+					else if ( listptr < 12.5f )
+						curpass3.x = jlinind;
+					else if ( listptr < 13.5f )
+						curpass3.y = jlinind;
+					else if ( listptr < 14.5f ) {
+						curpass3.z = jlinind;
+					}
+					else if ( listptr < 15.5f ) {
+						//We're done for this pass
+						curpass3.w = jlinind;
+						breakflag = -1.0f;
+					}
+					listptr += 1.0f;
+				}
+			}
+
+			jlinind += 1.0f;
+			exclind.y += 1.0f;
+			jind.x  += 1.0f;
+		}
+		jind.x = 0.0f;
+		jind.y += 1.0f;
+	}
+	
+}
+
+//Precomputes lennard jones sig and eps 
+//to save an indirect etch (and a ew flops) in the 
+//force kernel. The charge product is not done this way
+//because charges have to be etched anyway with the 
+//positions
+kernel void knl_precompute_sigeps(
+		float AtomStrWidth,
+		iter float2 wpos<>,
+		float2 sigeps[][], //x=sigma, y=epsilon
+		float4 nlist0<>,
+		float4 nlist1<>,
+		out float4 sig0<>,
+		out float4 eps0<>,
+		out float4 sig1<>,
+		out float4 eps1<>
+	   	)
+{
+	float2 jind;
+	float4 ind_tmp1, ind_tmp2;
+	float2 isigeps, jsigeps1, jsigeps2, jsigeps3, jsigeps4;
+	
+	isigeps = sigeps[ wpos ];
+
+	ind_tmp1 = floor( nlist0 / AtomStrWidth );
+	ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
+
+	jind.y = ind_tmp1.x;
+	jind.x = ind_tmp2.x;
+	jsigeps1 = sigeps[ jind ];
+	
+	jind.y = ind_tmp1.y;
+	jind.x = ind_tmp2.y;
+	jsigeps2 = sigeps[ jind ];
+	
+	jind.y = ind_tmp1.z;
+	jind.x = ind_tmp2.z;
+	jsigeps3 = sigeps[ jind ];
+	
+	jind.y = ind_tmp1.w;
+	jind.x = ind_tmp2.w;
+	jsigeps4 = sigeps[ jind ];
+
+	sig0.x = isigeps.x + jsigeps1.x;
+	sig0.y = isigeps.x + jsigeps2.x;
+	sig0.z = isigeps.x + jsigeps3.x;
+	sig0.w = isigeps.x + jsigeps4.x;
+	
+	eps0.x = isigeps.y * jsigeps1.y;
+	eps0.y = isigeps.y * jsigeps2.y;
+	eps0.z = isigeps.y * jsigeps3.y;
+	eps0.w = isigeps.y * jsigeps4.y;
+
+	//2nd nlist set
+	
+	ind_tmp1 = floor( nlist1 / AtomStrWidth );
+	ind_tmp2 = nlist1 - ind_tmp1 * AtomStrWidth;
+
+	jind.y = ind_tmp1.x;
+	jind.x = ind_tmp2.x;
+	jsigeps1 = sigeps[ jind ];
+	
+	jind.y = ind_tmp1.y;
+	jind.x = ind_tmp2.y;
+	jsigeps2 = sigeps[ jind ];
+	
+	jind.y = ind_tmp1.z;
+	jind.x = ind_tmp2.z;
+	jsigeps3 = sigeps[ jind ];
+	
+	jind.y = ind_tmp1.w;
+	jind.x = ind_tmp2.w;
+	jsigeps4 = sigeps[ jind ];
+
+	
+
+	sig1.x = isigeps.x + jsigeps1.x;
+	sig1.y = isigeps.x + jsigeps2.x;
+	sig1.z = isigeps.x + jsigeps3.x;
+	sig1.w = isigeps.x + jsigeps4.x;
+	
+	eps1.x = isigeps.y * jsigeps1.y;
+	eps1.y = isigeps.y * jsigeps2.y;
+	eps1.z = isigeps.y * jsigeps3.y;
+	eps1.w = isigeps.y * jsigeps4.y;
+	
+	
+}
--- a/platforms/brook/src/gpu/kpdihs.br
+++ b/platforms/brook/src/gpu/kpdihs.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+
+//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
+//
+//Input is a stream of quartets i, j, k, l and the output is
+//four float3 streams fi, fj, fk, fl.
+//If by any chance this kernel becomes the bottleneck, we will
+//optimize, but for now, this is kept pretty simple.
+//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
+//for each dihedral.
+kernel void kpdih( 
+        float xstrwidth, //stream width for x
+        float4 xq[][], //particle coordinates and charges
+        float4 atoms<>, //ijkl quartets
+        float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
+        out float3 fi<>, //output forces for i, j, k, l
+        out float3 fj<>,
+        out float3 fk<>,
+        out float3 fl<>
+	  	) {
+	
+	float3 r_ij, r_kj, r_kl;
+	float2 ai, aj, ak, al;
+	float3 m, n;
+	float sgnphi;
+	float cosfac;
+	float phi, ddphi, mdphi;
+	float3 u, v, s;
+	float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
+
+	//Convert from linear indices to 2D indices into x
+	//If this kernel is compute bound, we can do this 
+	//conversion before-hand and feed in the 2D coordinates
+	
+	ai.y = floor( atoms.x / xstrwidth );
+	ai.x = atoms.x - ai.y * xstrwidth;
+
+	aj.y = floor( atoms.y / xstrwidth );
+	aj.x = atoms.y - aj.y * xstrwidth;
+	
+	ak.y = floor( atoms.z / xstrwidth );
+	ak.x = atoms.z - ak.y * xstrwidth;
+	
+	al.y = floor( atoms.w / xstrwidth );
+	al.x = atoms.w - al.y * xstrwidth;
+	
+	
+	r_ij = xq[ai].xyz - xq[aj].xyz; //3
+	r_kj = xq[ak].xyz - xq[aj].xyz; //3
+	r_kl = xq[ak].xyz - xq[al].xyz; //3
+	
+	m = cross( r_ij, r_kj ); //9
+	n = cross( r_kj, r_kl ); //9
+
+	msq = dot(m, m); //5
+	nsq = dot(n, n); //5
+	
+	cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0, 1.0 ); //8
+	
+	sgnphi = sign( dot( r_ij, n ) ); //5
+
+	phi = sgnphi * acos( cos_phi ); //2
+
+	mdphi = parms.z * phi - parms.y; //2
+	ddphi = - parms.x * parms.z * sin( mdphi ); //3
+
+	nrkj2 = dot( r_kj, r_kj ); //5
+	nrkj  = sqrt( nrkj2 );     //1
+
+	fi = -ddphi * nrkj / msq * m; //5
+	fl =  ddphi * nrkj / nsq * n; //5
+	
+	u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
+	v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
+	s = u - v; //3
+	
+	fj = s - fi; //3
+	fk = -(s + fl); //3
+
+	//Total : 100 flops
+}
--- a/platforms/brook/src/gpu/krbdihs.br
+++ b/platforms/brook/src/gpu/krbdihs.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+
+//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
+//
+//Input is a stream of quartets i, j, k, l and the output is
+//four float3 streams fi, fj, fk, fl.
+//If by any chance this kernel becomes the bottleneck, we will
+//optimize, but for now, this is kept pretty simple.
+//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
+//for each dihedral.
+kernel void krbdih( 
+        float xstrwidth, //stream width for x
+        float4 xq[][], //particle coordinates and charges
+        float4 atoms<>, //ijkl quartets
+        float4 parm03<>, //params 0-3
+        float2 parm45<>, //params 4 and 5
+        out float3 fi<>, //output forces for i, j, k, l
+        out float3 fj<>,
+        out float3 fk<>,
+        out float3 fl<>
+	  	) {
+	
+	float3 r_ij, r_kj, r_kl;
+	float2 ai, aj, ak, al;
+	float3 m, n;
+	float sgnphi;
+	float cosfac;
+	float ddphi;
+	float3 u, v, s;
+	float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
+
+	//Convert from linear indices to 2D indices into x
+	//If this kernel is compute bound, we can do this 
+	//conversion before-hand and feed in the 2D coordinates
+	
+	ai.y = floor( atoms.x / xstrwidth );
+	ai.x = atoms.x - ai.y * xstrwidth;
+
+	aj.y = floor( atoms.y / xstrwidth );
+	aj.x = atoms.y - aj.y * xstrwidth;
+	
+	ak.y = floor( atoms.z / xstrwidth );
+	ak.x = atoms.z - ak.y * xstrwidth;
+	
+	al.y = floor( atoms.w / xstrwidth );
+	al.x = atoms.w - al.y * xstrwidth;
+	
+	
+	r_ij = xq[ai].xyz - xq[aj].xyz; //3
+	r_kj = xq[ak].xyz - xq[aj].xyz; //3
+	r_kl = xq[ak].xyz - xq[al].xyz; //3
+	
+	m = cross( r_ij, r_kj ); //9
+	n = cross( r_kj, r_kl ); //9
+
+	msq = dot(m, m); //5
+	nsq = dot(n, n); //5
+	
+	cos_phi = dot(m, n)/sqrt(msq*nsq); //8 (sqrt=1)
+
+	//Switching to "polymer convention"
+	//See gromacs code
+	cos_phi = -cos_phi;
+	sgnphi  = sign( dot(r_ij, n) ); //5
+	sin_phi = -sgnphi*sqrt( clamp( 1.0 - cos_phi * cos_phi, 0.0, 1.0) ); //3
+
+	//ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1}
+	//This might not be the best way to use the 
+	//4-way mads, but for now we'll let fxc figure it
+	//out.
+	//If we precompute some ratios of the parameters
+	//we can use the 4-way mads better
+	ddphi = 5.0 * parm45.y;
+	ddphi = 4.0 * parm45.x + ddphi * cos_phi;
+	ddphi = 3.0 * parm03.w + ddphi * cos_phi;
+	ddphi = 2.0 * parm03.z + ddphi * cos_phi;
+	ddphi = parm03.y + ddphi * cos_phi;
+	ddphi = -ddphi * sin_phi; //13 flops total for ddphi
+
+	nrkj2 = dot( r_kj, r_kj ); //5
+	nrkj  = sqrt( nrkj2 );     //1
+
+	fi = -ddphi * nrkj / msq * m; //5
+	fl =  ddphi * nrkj / nsq * n; //5
+	
+	u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
+	v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
+	s = u - v; //3
+	
+	fj = s - fi; //3
+	fk = -(s + fl); //3
+
+	//Total flops: 109 per rb torsion.
+}
--- a/platforms/brook/src/gpu/kshakeh.br
+++ b/platforms/brook/src/gpu/kshakeh.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+/*
+ * This kernel constrains bonds involving hydrogen using an unrolled
+ * shake implementation. We assume that a heavy atom can have at most 
+ * 3 hydrogens attached. The hydrogens are expected to be identical
+ *
+ * For atoms that have ewer than 3 hydrogens
+ *
+ * Also we don't check the constraints!
+ *  
+ * We do a fixed number of iterations without checking for convergence
+ * to avoid stalling
+ * *************************************************************/
+
+/*
+kernel void
+kshakeh(
+		float nit,      //number of iterations
+		float strwidth, //stream width of posq
+		float invmH,    //inverse mass of hydrogen
+		float omega,    //parameter from gromacs, possibly unused
+		float4 atoms<>, //heavy0, h1, h2, h3
+		float4 posq[][], //positions before update
+		float4 posqp[][], //positions after update
+		float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
+		out float4 cposq0<>, //constrained position for heavy atom
+		out float4 cposq1<>, //ditto for h1
+		out float4 cposq2<>, //ditto for h2
+		out float4 cposq3<> //ditto for h3
+		) {
+	float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
+	float i; //iteration count
+	float3 xi, xj1, xj2, xj3; //coordinates
+	float3 xpi, xpj1, xpj2, xpj3; //coordinates
+	float3 rij1, rij2, rij3, rpij, dr;
+	float rpsqij, rrpr, acor;
+	float mask2, mask3;
+	float diff;
+	
+	ai.y  = floor( atoms.x / strwidth );
+	ai.x  = atoms.x - ai.y * strwidth;
+
+	aj1.y = floor( atoms.y / strwidth );
+	aj1.x = atoms.y - aj1.y * strwidth;
+
+	//If further hydrogens are absent,
+	//just set to the coordinates of the first
+	//so we don't have any junk memory accesses
+	//or nans/infs in the calcs.	
+	if ( atoms.z > -0.5f ) {
+		aj2.y = floor( atoms.z / strwidth );
+		aj2.x = atoms.z - aj2.y * strwidth;
+		mask2 = 1.0f;
+	}
+	else {
+		aj2 = aj1; 
+		mask2 = 0.0f;
+	}
+
+	if ( atoms.w > -0.5f ) {	
+		aj3.y = floor( atoms.w / strwidth );
+		aj3.x = atoms.w - aj3.y * strwidth;
+		mask3 = 1.0f;
+	}
+	else {
+		aj3 = aj1;
+		mask3 = 0.0f;
+	}
+
+
+	cposq0 = posq[ai];
+	cposq1 = posq[aj1];
+	cposq2 = posq[aj2];
+	cposq3 = posq[aj3];
+	
+	
+	xi  = cposq0.xyz;
+	xj1 = cposq1.xyz;
+	xj2 = cposq2.xyz;
+	xj3 = cposq3.xyz;
+	
+	rij1 = xi - xj1;
+	rij2 = xi - xj2;
+	rij3 = xi - xj3;
+
+	xpi  = posqp[ai].xyz;
+	xpj1 = posqp[aj1].xyz;
+	xpj2 = posqp[aj2].xyz;
+	xpj3 = posqp[aj3].xyz;
+
+	i = 0.0f;
+	while ( i < 15.0f ) {
+		//First hydrogen
+		rpij   = xpi - xpj1;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij1, rpij );
+		//for debugging only
+		diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
+		if ( diff < 1.0f )
+			acor = 0.0f;
+		else
+			//params.y = mu/2, params.z = blen*blen
+			acor = omega * ( params.z - rpsqij ) * params.y / rrpr ;
+		dr   = rij1 * acor;
+		xpi  += dr * params.x;
+		xpj1 -= dr * invmH;
+
+		//Second hydrogen
+		rpij   = xpi - xpj2;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij2, rpij );
+		//for debugging only
+		diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
+		if ( diff < 1.0f )
+			acor = 0.0f;
+		else
+			//params.y = mu/2, params.z = blen*blen
+			acor = mask2 * omega * ( params.z - rpsqij ) * params.y / rrpr ;
+		dr   = rij2 * acor;
+		xpi  += dr * params.x;
+		xpj2 -= dr * invmH;
+
+		//Third hydrogen
+		rpij   = xpi - xpj3;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij3, rpij );
+		//for debugging only
+		diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
+		if ( diff < 1.0f )
+			acor = 0.0f;
+		else
+			//params.y = mu/2, params.z = blen*blen
+			acor = mask3 * omega * ( params.z - rpsqij ) * params.y / rrpr ;
+		dr   = rij3 * acor;
+		xpi  += dr * params.x;
+		xpj3 -= dr * invmH;
+		
+		i += 1.0f;
+	}
+
+	cposq0.xyz = xpi;
+	cposq1.xyz = xpj1;
+	cposq2.xyz = xpj2;
+	cposq3.xyz = xpj3;
+} */
+
+/*Applies the updated positions
+ *Each atom occurs at one and only one position
+ * */
+/*
+kernel void kshakeh_update( 
+		float strwidth, //width of cposq streams
+		float2 invmap<>,    //shakeh inverse map
+		float4 posq<>,     //untouched positions
+		float4 cposq0[][], //constrained position for heavy atom
+		float4 cposq1[][], //ditto for h1
+		float4 cposq2[][], //ditto for h2
+		float4 cposq3[][], //ditto for h3
+		out float4 oposq<> //updated positions
+		){
+	float2 atom;
+	
+	atom.y = floor( invmap.x / strwidth );
+	atom.x = invmap.x - atom.y * strwidth;
+	
+	if ( invmap.y < 0 )
+		oposq = posq;
+	else if ( invmap.y < 0.5f ) 
+		oposq = cposq0[ atom ];
+	else if ( invmap.y < 1.5f )
+		oposq = cposq1[ atom ];
+	else if ( invmap.y < 2.5f )
+		oposq = cposq2[ atom ];
+	else if ( invmap.y < 3.5f )
+		oposq = cposq3[ atom ];
+} */
+
+/* Fix for precision of terms first order in dt
+ * To be used with corresponding update kernels
+ * The input posqp is now changes to posq rather than
+ * posq+changes
+ * */
+kernel void
+kshakeh_fix1(
+		float nit,      //number of iterations
+		float strwidth, //stream width of posq
+		float invmH,    //inverse mass of hydrogen
+		float omega,    //parameter from gromacs, possibly unused
+		float4 atoms<>, //heavy0, h1, h2, h3
+		float4 posq[][], //positions before update
+		float4 posqp[][], //changes to positions
+		float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
+		out float4 cposq0<>, //constrained position for heavy atom
+		out float4 cposq1<>, //ditto for h1
+		out float4 cposq2<>, //ditto for h2
+		out float4 cposq3<> //ditto for h3
+		) {
+	float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
+	float i; //iteration count
+	float3 xi, xj1, xj2, xj3; //coordinates
+	float3 xpi, xpj1, xpj2, xpj3; //coordinates
+	float3 rij1, rij2, rij3, rpij, dr;
+	float rpsqij, rrpr, acor;
+	float mask2, mask3;
+	float diff;
+	float rij1sq, rij2sq, rij3sq;
+	float ld1, ld2, ld3;
+	
+	// ai.y  = floor( atoms.x / strwidth );
+   ai.y  = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
+	ai.x  = atoms.x - ai.y * strwidth;
+
+	// aj1.y = floor( atoms.y / strwidth );
+   aj1.y  = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
+	aj1.x = atoms.y - aj1.y * strwidth;
+
+	//If further hydrogens are absent,
+	//just set to the coordinates of the first
+	//so we don't have any junk memory accesses
+	//or nans/infs in the calcs.	
+	if ( atoms.z > -0.5f ) {
+		// aj2.y = floor( atoms.z / strwidth );
+		aj2.y = round( (atoms.z - fmod( atoms.z, strwidth ))/strwidth );
+		aj2.x = atoms.z - aj2.y * strwidth;
+		mask2 = 1.0f;
+	}
+	else {
+		aj2 = aj1; 
+		mask2 = 0.0f;
+	}
+
+	if ( atoms.w > -0.5f ) {	
+		// aj3.y = floor( atoms.w / strwidth );
+		aj3.y = round( (atoms.w - fmod( atoms.w, strwidth ))/strwidth );
+		aj3.x = atoms.w - aj3.y * strwidth;
+		mask3 = 1.0f;
+	}
+	else {
+		aj3 = aj1;
+		mask3 = 0.0f;
+	}
+
+
+	cposq0 = posq[ai];
+	cposq1 = posq[aj1];
+	cposq2 = posq[aj2];
+	cposq3 = posq[aj3];
+	
+	xi  = cposq0.xyz;
+	xj1 = cposq1.xyz;
+	xj2 = cposq2.xyz;
+	xj3 = cposq3.xyz;
+	
+	rij1 = xi - xj1;
+	rij2 = xi - xj2;
+	rij3 = xi - xj3;
+
+	rij1sq = dot( rij1, rij1 );
+	rij2sq = dot( rij2, rij2 );
+	rij3sq = dot( rij3, rij3 );
+
+	ld1 = params.z - rij1sq;
+	ld2 = params.z - rij2sq;
+	ld3 = params.z - rij3sq;
+
+/*
+	xpi  = posqp[ai].xyz;
+	xpj1 = posqp[aj1].xyz;
+	xpj2 = posqp[aj2].xyz;
+	xpj3 = posqp[aj3].xyz;
+*/
+
+
+	xpi  = posqp[ai].xyz   - xi;
+	xpj1 = posqp[aj1].xyz  - xj1;
+	xpj2 = posqp[aj2].xyz  - xj2;
+	xpj3 = posqp[aj3].xyz  - xj3;
+
+
+// XXX
+	// cposq0 = posqp[ai];
+	// cposq1 = posqp[aj1];
+	// cposq2 = posqp[aj2];
+	// cposq3 = posqp[aj3];
+// OK
+// XXX
+	
+
+
+	i = 0.0f;
+	while ( i < 15.0f ) {
+		//First hydrogen
+		rpij   = xpi - xpj1; //This is really rpij - rij
+		rpsqij = dot( rpij, rpij ); //This is really deltar ^ 2
+		rrpr   = dot( rij1, rpij ); //This is r.deltar
+		//for debugging only
+		//params.y = mu/2, params.z = blen*blen
+		diff = abs( ld1 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
+		if ( diff < 1.0f )
+		  acor = 0.0f;
+		 else
+			acor = omega * ( ld1 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij1sq ) ;
+		dr   = rij1 * acor;
+		xpi  += dr * params.x;
+		xpj1 -= dr * invmH;
+
+		//Second hydrogen
+		rpij   = xpi - xpj2;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij2, rpij );
+		//for debugging only
+		diff = abs( ld2 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
+		if ( diff < 1.0f )
+			acor = 0.0f;
+		else
+			acor = mask2 * omega * ( ld2 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij2sq ) ;
+		dr   = rij2 * acor;
+		xpi  += dr * params.x;
+		xpj2 -= dr * invmH;
+
+		//Third hydrogen
+		rpij   = xpi - xpj3;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij3, rpij );
+		diff = abs( ld3 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
+		// for debugging only
+		if ( diff < 1.0f )
+			 acor = 0.0f;
+		else
+			acor = mask3 * omega * ( ld3 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij3sq ) ;
+		dr   = rij3 * acor;
+		xpi  += dr * params.x;
+		xpj3 -= dr * invmH;
+		
+		i += 1.0f;
+	}
+
+	//Output modified delta's
+	cposq0.xyz = xpi;
+	cposq1.xyz = xpj1;
+	cposq2.xyz = xpj2;
+	cposq3.xyz = xpj3;
+
+}
+
+kernel void
+kshakeh_fix2(
+		float nit,      //number of iterations
+		float strwidth, //stream width of posq
+		float invmH,    //inverse mass of hydrogen
+		float omega,    //parameter from gromacs, possibly unused
+		float4 atoms<>, //heavy0, h1, h2, h3
+		float4 posq[][], //positions before update
+		float4 posqp[][], //changes to positions
+		float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
+		out float4 cposq0<>, //constrained position for heavy atom
+		out float4 cposq1<>, //ditto for h1
+		out float4 cposq2<>, //ditto for h2
+		out float4 cposq3<> //ditto for h3
+		) {
+	float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
+	float i; //iteration count
+	float3 xi, xj1, xj2, xj3; //coordinates
+	float3 xpi, xpj1, xpj2, xpj3; //coordinates
+	float3 rij1, rij2, rij3, rpij, dr;
+	float rpsqij, rrpr, acor;
+	float mask2, mask3;
+	float diff;
+	float rij1sq, rij2sq, rij3sq;
+	float ld1, ld2, ld3;
+	
+	// ai.y  = floor( atoms.x / strwidth );
+   ai.y  = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
+	ai.x  = atoms.x - ai.y * strwidth;
+
+	// aj1.y = floor( atoms.y / strwidth );
+   aj1.y  = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
+	aj1.x = atoms.y - aj1.y * strwidth;
+
+	//If further hydrogens are absent,
+	//just set to the coordinates of the first
+	//so we don't have any junk memory accesses
+	//or nans/infs in the calcs.	
+	if ( atoms.z > -0.5f ) {
+		// aj2.y = floor( atoms.z / strwidth );
+		aj2.y = round( (atoms.z - fmod( atoms.z, strwidth ))/strwidth );
+		aj2.x = atoms.z - aj2.y * strwidth;
+		mask2 = 1.0f;
+	}
+	else {
+		aj2 = aj1; 
+		mask2 = 0.0f;
+	}
+
+	if ( atoms.w > -0.5f ) {	
+		// aj3.y = floor( atoms.w / strwidth );
+		aj3.y = round( (atoms.w - fmod( atoms.w, strwidth ))/strwidth );
+		aj3.x = atoms.w - aj3.y * strwidth;
+		mask3 = 1.0f;
+	}
+	else {
+		aj3 = aj1;
+		mask3 = 0.0f;
+	}
+
+
+	cposq0 = posq[ai];
+	cposq1 = posq[aj1];
+	cposq2 = posq[aj2];
+	cposq3 = posq[aj3];
+	
+	xi  = cposq0.xyz;
+	xj1 = cposq1.xyz;
+	xj2 = cposq2.xyz;
+	xj3 = cposq3.xyz;
+	
+	rij1 = xi - xj1;
+	rij2 = xi - xj2;
+	rij3 = xi - xj3;
+
+	rij1sq = dot( rij1, rij1 );
+	rij2sq = dot( rij2, rij2 );
+	rij3sq = dot( rij3, rij3 );
+
+	ld1 = params.z - rij1sq;
+	ld2 = params.z - rij2sq;
+	ld3 = params.z - rij3sq;
+
+	xpi  = posqp[ai].xyz;
+	xpj1 = posqp[aj1].xyz;
+	xpj2 = posqp[aj2].xyz;
+	xpj3 = posqp[aj3].xyz;
+
+
+/*
+	xpi  = posqp[ai].xyz   - xi;
+	xpj1 = posqp[aj1].xyz  - xj1;
+	xpj2 = posqp[aj2].xyz  - xj2;
+	xpj3 = posqp[aj3].xyz  - xj3;
+
+*/
+
+// XXX
+	// cposq0 = posqp[ai];
+	// cposq1 = posqp[aj1];
+	// cposq2 = posqp[aj2];
+	// cposq3 = posqp[aj3];
+// OK
+// XXX
+	
+
+
+	i = 0.0f;
+	while ( i < 15.0f ) {
+		//First hydrogen
+		rpij   = xpi - xpj1; //This is really rpij - rij
+		rpsqij = dot( rpij, rpij ); //This is really deltar ^ 2
+		rrpr   = dot( rij1, rpij ); //This is r.deltar
+		//for debugging only
+		//params.y = mu/2, params.z = blen*blen
+		diff = abs( ld1 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
+		if ( diff < 1.0f )
+		  acor = 0.0f;
+		 else
+			acor = omega * ( ld1 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij1sq ) ;
+		dr   = rij1 * acor;
+		xpi  += dr * params.x;
+		xpj1 -= dr * invmH;
+
+		//Second hydrogen
+		rpij   = xpi - xpj2;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij2, rpij );
+		//for debugging only
+		diff = abs( ld2 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
+		if ( diff < 1.0f )
+			acor = 0.0f;
+		else
+			acor = mask2 * omega * ( ld2 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij2sq ) ;
+		dr   = rij2 * acor;
+		xpi  += dr * params.x;
+		xpj2 -= dr * invmH;
+
+		//Third hydrogen
+		rpij   = xpi - xpj3;
+		rpsqij = dot( rpij, rpij );
+		rrpr   = dot( rij3, rpij );
+		diff = abs( ld3 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
+		// for debugging only
+		if ( diff < 1.0f )
+			 acor = 0.0f;
+		else
+			acor = mask3 * omega * ( ld3 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij3sq ) ;
+		dr   = rij3 * acor;
+		xpi  += dr * params.x;
+		xpj3 -= dr * invmH;
+		
+		i += 1.0f;
+	}
+
+	//Output modified delta's
+	cposq0.xyz = xpi;
+	cposq1.xyz = xpj1;
+	cposq2.xyz = xpj2;
+	cposq3.xyz = xpj3;
+
+}
+
+/*Applies the updated delta's
+ *Each atom occurs at one and only one position
+ * */
+kernel void kshakeh_update1_fix1Old( 
+		float strwidth, //width of cposq streams
+		float2 invmap<>,    //shakeh inverse map
+		float4 posq<>,     //constrained deltas
+		float4 cposq0[][], //constrained delta for heavy atom
+		float4 cposq1[][], //ditto for h1
+		float4 cposq2[][], //ditto for h2
+		float4 cposq3[][], //ditto for h3
+		out float4 oposq<> //updated deltas
+		){
+	float2 atom;
+	
+	//atom.y = floor( invmap.x / strwidth );
+   atom.y  = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
+	atom.x = invmap.x - atom.y * strwidth;
+	
+	if ( invmap.y < 0 )
+		oposq = posq;
+	else if ( invmap.y < 0.5f ) 
+		oposq = cposq0[ atom ];
+	else if ( invmap.y < 1.5f )
+		oposq = cposq1[ atom ];
+	else if ( invmap.y < 2.5f )
+		oposq = cposq2[ atom ];
+	else if ( invmap.y < 3.5f )
+		oposq = cposq3[ atom ];
+}
+
+kernel void kshakeh_update2_fix1( 
+		float strwidth, //width of cposq streams
+		float2 invmap<>,    //shakeh inverse map
+		float4 posq<>,     //old positions
+		float4 posqp<>,    //deltas from sd2
+		float4 cposq0[][], //constrained delta for heavy atom
+		float4 cposq1[][], //ditto for h1
+		float4 cposq2[][], //ditto for h2
+		float4 cposq3[][], //ditto for h3
+		out float4 oposq<> //updated deltas
+		){
+	float2 atom;
+	
+	// atom.y = floor( invmap.x / strwidth );
+   atom.y  = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
+	atom.x = invmap.x - atom.y * strwidth;
+	
+	if ( invmap.y < 0 ){
+		oposq.w = posq.w;
+		oposq.xyz = posqp.xyz - posq.xyz;
+	} else if ( invmap.y < 0.5f ) 
+		oposq = cposq0[ atom ];
+	else if ( invmap.y < 1.5f)
+		oposq = cposq1[ atom ];
+	else if ( invmap.y < 2.5f )
+		oposq = cposq2[ atom ];
+	else if ( invmap.y < 3.5f )
+		oposq = cposq3[ atom ];
+
+	oposq.xyz += posq.xyz;
+}
+
+kernel void kshakeh_update1_fix1( 
+		float strwidth, //width of cposq streams
+      float sdFactor,
+		float2 invmap<>,    //shakeh inverse map
+		float4 posq<>,     //old positions
+		float4 posqp<>,    //deltas from sd2
+		float3 vp<>,    //deltas from sd2
+		float4 cposq0[][], //constrained delta for heavy atom
+		float4 cposq1[][], //ditto for h1
+		float4 cposq2[][], //ditto for h2
+		float4 cposq3[][], //ditto for h3
+		out float4 oposq<> //updated deltas
+		){
+	float2 atom;
+	
+	// atom.y = floor( invmap.x / strwidth );
+   atom.y  = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
+	atom.x = invmap.x - atom.y * strwidth;
+	
+	oposq = posq;
+	if ( invmap.y < 0 ){
+		oposq = posqp;
+	} else if ( invmap.y < 0.5f ) 
+		oposq.xyz += cposq0[ atom ].xyz;
+	else if ( invmap.y < 1.5f )
+		oposq.xyz += cposq1[ atom ].xyz;
+	else if ( invmap.y < 2.5f )
+		oposq.xyz += cposq2[ atom ].xyz;
+	else if ( invmap.y < 3.5f )
+		oposq.xyz += cposq3[ atom ].xyz;
+
+}
--- a/platforms/brook/src/gpu/kupdatemd.br
+++ b/platforms/brook/src/gpu/kupdatemd.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+kernel void kupdate_md1_berendsen( 
+		float dt,
+		float3 lg, //Berendsen coupling, assuming only one group
+		float3 uold, //Mean velocity from previous step
+		float4 posq<>, 
+		float3 v<>, 
+		float3 f<>,
+		float invmass<>,
+		out float3 vnew<>, 
+		out float4 posqp<> )
+{
+	float3 vb;
+	float3 one;
+	posqp = posq;
+
+	one = float3( 1.0f, 1.0f, 1.0f );
+
+	vb = ( one - lg ) * uold;
+	
+	vnew = lg* ( v + f * invmass * dt ) + vb;
+ 	
+	posqp.xyz += vnew * dt;
+}
+
+//Nose-Hoover / Parinello Rahman
+kernel void kupdate_md1_extended (
+		float dt,
+		float3 lg, 
+		float xi,
+		float3 M0, 
+		float3 M1,
+		float3 M2,
+		float3 uold,
+		float4 posq<>, 
+		float3 v<>, 
+		float3 f<>,
+		float invmass<>,
+		out float3 vnew<>, 
+		out float4 posqp<> )
+{
+	float3 vrel, vnrel;
+	float3 vtrans;
+
+	vrel  = v - uold;
+	vtrans = float3( dot(M0, vrel), dot(M1, vrel), dot(M2, vrel) );
+	vrel += dt * ( invmass * f - xi * vrel - vtrans );
+	vnew = uold + lg * vrel;
+	posqp = posq;
+	posqp.xyz += vnew * dt;
+}
+
+		
+kernel void kupdate_md2(
+		float dtinv, //1/dt
+		float4 posqp<>, //positions after constraints
+		float4 posq<>,  //positions before update
+		out float3 vnew<>, //Corrected velocities
+		out float4 posqnew<> //equal to posqp, avoids an extra call to copy
+		)
+{
+	posqnew = posqp;
+	vnew = ( posqp - posq ) * dtinv;
+}
+
+/* Version that handles terms of order dt more carefully
+ * Update1 computes deltax's rather than x + deltax 
+ * Corresponding shake implementation modifies the delta's
+ * Update2 adds the constrained deltas to x.
+ * */
+
+kernel void kupdate_md1_berendsen_fix1( 
+		float dt,
+		float3 lg, //Berendsen coupling, assuming only one group
+		float3 uold, //Mean velocity from previous step
+		float4 posq<>, 
+		float3 v<>, 
+		float3 f<>,
+		float invmass<>,
+		out float3 vnew<>, 
+		out float4 posqp<> )
+{
+	float3 vb;
+	float3 one;
+
+	one = float3( 1.0f, 1.0f, 1.0f );
+
+	vb = ( one - lg ) * uold;
+	
+	vnew = lg* ( v + f * invmass * dt ) + vb;
+ 	
+	posqp.xyz = vnew * dt;
+	posqp.w = posq.w;
+}
+
+//Nose-Hoover / Parinello Rahman
+kernel void kupdate_md1_extended_fix1 (
+		float dt,
+		float3 lg, 
+		float xi,
+		float3 M0, 
+		float3 M1,
+		float3 M2,
+		float3 uold,
+		float4 posq<>, 
+		float3 v<>, 
+		float3 f<>,
+		float invmass<>,
+		out float3 vnew<>, 
+		out float4 posqp<> )
+{
+	float3 vrel, vnrel;
+	float3 vtrans;
+
+	vrel  = v - uold;
+	vtrans = float3( dot(M0, vrel), dot(M1, vrel), dot(M2, vrel) );
+	vrel += dt * ( invmass * f - xi * vrel - vtrans );
+	vnew = uold + lg * vrel;
+	posqp.w = posq.w;
+	posqp.xyz = vnew * dt;
+}
+
+kernel void kupdate_md2_fix1(
+		float dtinv, //1/dt
+		float4 posqp<>, //positions after constraints
+		float4 posq<>,  //positions before update
+		out float3 vnew<>, //Corrected velocities
+		out float4 posqnew<> //equal to posqp, avoids an extra call to copy
+		)
+{
+	posqnew.xyz = posq.xyz + posqp.xyz;
+	posqnew.w   = posq.w;
+	vnew = posqp.xyz * dtinv;
+}
+
--- a/platforms/brook/src/gpu/kupdatesd.br
+++ b/platforms/brook/src/gpu/kupdatesd.br
+
+/****************************************************************
+* This file is part of the gpu acceleration library for gromacs.
+* Author: V. Vishal
+* Copyright (C) Pande Group, Stanford, 2006
+*****************************************************************/
+
+
+/* Stochastic Integrator
+ *
+ * Since we don't have random numbers on the GPU, we 
+ * precalculate random numbers from a normal distribution
+ * and store them in several large textures. Periodically
+ * these will have to be refreshed on the CPU.
+ *
+ * Doing gathers instead of streams for the random vectors, 
+ * but the gathers should be fully coherent anyway. 
+ * This is just to avoid some
+ * painful manipulations with multiple streams and domains.  
+ * */
+
+
+/* First part before first constraint call 
+ *
+ *
+ * Many things can be precalculated here
+ * If we become compute bound (wonder how), we can precalc stuff
+ *
+ * Assuming constant temperature. If you want to change the 
+ * temperature, you have to change all the precalculated 
+ * constants that use it.
+ * */
+
+/*
+kernel void kupdate_sd1(
+		float xstrwidth,   //atom stream width
+		float gstrwidth,   //stream width of random numbers
+		float goffset,     //starting offset for random numbers
+
+		//sd constants, prefixed with c 
+		//to avoid 1 letter names, cxx = sdc[0].xx
+		float cem,
+
+		//sd precalculated constants
+		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
+		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
+		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
+		
+		//sdpc = sd precalculated per atom values
+		//sdpc.x = 1/sqrt(m)*sig[0].Yv
+		//sdpc.y = 1/sqrt(m)*sig[0].V
+		float2 sdpc<>,   
+		
+		//Normally distributed random vectors
+		float3 fgauss[][],
+	
+		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
+		float4 posq<>,
+		float3 f<>,
+		float3 v<>,
+
+		float invmass<>,
+		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
+		out float3 vnew<>,
+		out float4 posqp<>
+		){
+	float3 Vmh;
+	float3 fg1, fg2;
+	float linind_gauss; //linear index into the random numbers
+	float2 igauss;      //2d index
+
+	igauss = indexof( posq );
+	linind_gauss = igauss.y * xstrwidth + igauss.x;
+	//2 random vectors used in each fragment
+	linind_gauss = 2 * linind_gauss + goffset; 	
+	
+	igauss.y = floor( linind_gauss / gstrwidth );
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+
+	fg1 = fgauss[ igauss ];
+
+	linind_gauss += 1.0f;
+	igauss.y = floor( linind_gauss / gstrwidth );
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+
+	fg2 = fgauss[ igauss ];
+	
+	Vmh  = sd2X * pc3 + sdpc.x * fg1;
+	sd1V = sdpc.y * fg2;
+	
+	vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
+	
+	posqp.w = posq.w;
+	posqp.xyz = posq.xyz + vnew * pc2;
+	
+} */
+
+/*Second part of sd update, after first constraints call*/
+/*
+kernel void kupdate_sd2(
+		float xstrwidth,   //stream width of atoms
+		float gstrwidth,   //stream width of random numbers
+		float goffset,     //starting offset for random numbers
+		
+		float pc1,          //this is 1/pc2 for sd1 
+		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
+		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
+		
+
+		//per atom precalcs
+		//sdpc.x = 1/sqrt(m)*sig[0].Yx
+		//sdpc.y = 1/sqrt(m)*sig[0].X
+		float2 sdpc<>,
+
+		//Normally distributed random vectors
+		float3 fgauss[][],
+		
+		float3 sd1V<>,      //calculated in sd1
+		float4 posq<>,      //positions pre-update
+		float4 posqp<>,     //positions post-constraint
+		float3 vnew<>,      //velocities after sd1
+		out float3 sd2X<>,  //Used in sd1
+		out float3 v<>,     //velocities corrected for constraints
+		out float4 posqp2<> //Will need to be constrained again
+		){
+	float linind_gauss; //linear index into the random numbers
+	float2 igauss;      //2d index
+	float3 fg1, fg2;
+	float3 Xmh;
+
+	igauss = indexof( posq );
+	linind_gauss = igauss.y * xstrwidth + igauss.x;
+	linind_gauss = 2 * linind_gauss + goffset; 	
+	
+	igauss.y = floor( linind_gauss / gstrwidth );
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+	fg1 = fgauss[ igauss ];
+
+	linind_gauss += 1.0f;
+	igauss.y = floor( linind_gauss / gstrwidth );
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+	fg2 = fgauss[ igauss ];
+			
+	v = pc1 * ( posqp.xyz - posq.xyz );
+	Xmh = sd1V * pc2 + sdpc.x * fg1;
+	
+	sd2X = sdpc.y * fg2;
+
+	posqp2 = posqp;
+	posqp2.xyz += sd2X - Xmh;
+} */
+
+//Applies a permutation to the given random vector stream
+//Totally bandwidth bound and streams are large, should be done infrequently.
+//Probably don't want gvin and gvout to be the same stream!
+kernel void kpermute_vectors( float gstrwidth, float perm<>, 
+		float3 gvin[][], out float3 gvout<> )
+{
+	float2 ind;
+	//ind.y = floor( perm / gstrwidth );
+   ind.y = round( (perm - fmod( perm, gstrwidth ))/gstrwidth );
+	ind.x = perm - ind.y * gstrwidth;
+
+	gvout = gvin[ ind ];
+}
+
+/*
+ * Alternative formulation to handle precision better
+ * In updatesd1, we output only deltaV's. The shake
+ * kernel uses the deltaV's and the X's to output 
+ * a bunch of constrained deltax's
+ * */
+
+
+kernel void kupdate_sd1_fix1(
+		float xstrwidth,   //atom stream width
+		float gstrwidth,   //stream width of random numbers
+		float goffset,     //starting offset for random numbers
+
+		//sd constants, prefixed with c 
+		//to avoid 1 letter names, cxx = sdc[0].xx
+		float cem,
+
+		//sd precalculated constants
+		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
+		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
+		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
+		
+		//sdpc = sd precalculated per atom values
+		//sdpc.x = 1/sqrt(m)*sig[0].Yv
+		//sdpc.y = 1/sqrt(m)*sig[0].V
+		float2 sdpc<>,   
+		
+		//Normally distributed random vectors
+		float3 fgauss[][],
+	
+		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
+		float4 posq<>,
+		float3 f<>,
+		float3 v<>,
+
+		float invmass<>,
+		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
+		out float3 vnew<>,
+		out float4 posqp<>
+		){
+	float3 Vmh;
+	float3 fg1, fg2;
+	float linind_gauss; //linear index into the random numbers
+	float2 igauss;      //2d index
+
+	igauss = indexof( posq );
+	linind_gauss = igauss.y * xstrwidth + igauss.x;
+	//2 random vectors used in each fragment
+	linind_gauss = 2 * linind_gauss + goffset; 	
+	
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+
+	fg1 = fgauss[ igauss ];
+
+	linind_gauss += 1.0f;
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+
+	fg2 = fgauss[ igauss ];
+	
+   /* 
+      Gromacs code
+    n               = 1 -> ngtc
+
+    sdc[n].gdt      = dt/tau_t[n];
+    sdc[n].eph      = exp(sdc[n].gdt/2);
+    sdc[n].emh      = exp(-sdc[n].gdt/2);
+    sdc[n].em       = exp(-sdc[n].gdt);
+
+    sdc[n].b        = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);
+
+    // 2.15 in paper (C)
+    sdc[n].c        = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
+
+    sdc[n].d        = 2 - sdc[n].eph - sdc[n].emh;
+
+      Vmh           = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);
+
+      V[n-start][d] = ism*sig[gt].V*fgauss(&jran);
+
+      v[n][d]       = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
+                      V[n-start][d] - sdc[gt].em*Vmh;
+           
+      xprime[n][d]  = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh); 
+  
+   */
+
+// ---------------------------------------------------------------------------------------------
+	// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
+   //sdpc.x = 1/sqrt(m)*sig[0].Yv
+	Vmh  = sd2X * pc3 + sdpc.x * fg1;
+// ---------------------------------------------------------------------------------------------
+
+// ---------------------------------------------------------------------------------------------
+	//sdpc.y = 1/sqrt(m)*sig[0].V
+	sd1V = sdpc.y * fg2;
+// ---------------------------------------------------------------------------------------------
+	
+
+// ---------------------------------------------------------------------------------------------
+   // 
+   // float pc1 = tau_t[0] * ( 1 - sdc[0].em )
+	vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
+	
+   // float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
+	posqp       = posq;
+	posqp.xyz  += vnew * pc2;
+}
+
+kernel void kupdate_sd2_fix1(
+		float xstrwidth,   //stream width of atoms
+		float gstrwidth,   //stream width of random numbers
+		float goffset,     //starting offset for random numbers
+		
+		float pc1,          //this is 1/pc2 for sd1 
+		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
+		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
+		
+
+		//per atom precalcs
+		//sdpc.x = 1/sqrt(m)*sig[0].Yx
+		//sdpc.y = 1/sqrt(m)*sig[0].X
+		float2 sdpc<>,
+
+		//Normally distributed random vectors
+		float3 fgauss[][],
+		
+		float3 sd1V<>,      //calculated in sd1
+		float4 posq<>,      //positions pre-update
+		float4 posqp<>,     //deltas post constraint
+		float3 vnew<>,      //velocities after sd1
+		out float3 sd2X<>,  //Used in sd1
+		out float3 v<>,     //velocities corrected for constraints
+		out float4 posqp2<> //new deltas, need to be constrained again
+		){
+	float linind_gauss; //linear index into the random numbers
+	float2 igauss;      //2d index
+	float3 fg1, fg2;
+	float3 Xmh;
+
+	igauss = indexof( posq );
+	linind_gauss = igauss.y * xstrwidth + igauss.x;
+	linind_gauss = 2 * linind_gauss + goffset; 	
+	
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+	fg1 = fgauss[ igauss ];
+
+	linind_gauss += 1.0f;
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+	fg2 = fgauss[ igauss ];
+			
+	//v   = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
+	v   = pc1 * (posqp.xyz - posq.xyz);
+	Xmh = sd1V * pc2 + sdpc.x * fg1;
+	
+	sd2X = sdpc.y * fg2;
+
+	posqp2 = posqp;
+	posqp2.xyz += sd2X - Xmh;
+}
+
+
+kernel void kupdate_sd1_fix1_FixedRV(
+		float xstrwidth,   //atom stream width
+		float gstrwidth,   //stream width of random numbers
+		float goffset,     //starting offset for random numbers
+
+		//sd constants, prefixed with c 
+		//to avoid 1 letter names, cxx = sdc[0].xx
+		float cem,
+
+		//sd precalculated constants
+		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
+		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
+		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
+		
+		//sdpc = sd precalculated per atom values
+		//sdpc.x = 1/sqrt(m)*sig[0].Yv
+		//sdpc.y = 1/sqrt(m)*sig[0].V
+		float2 sdpc<>,   
+		
+		//Normally distributed random vectors
+		float3 fgauss[][],
+	
+		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
+		float4 posq<>,
+		float3 f<>,
+		float3 v<>,
+
+		float invmass<>,
+		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
+		out float3 vnew<>,
+		out float4 posqp<>
+		){
+	float3 Vmh;
+	float3 fg1, fg2;
+	float linind_gauss; //linear index into the random numbers
+	float2 igauss;      //2d index
+   float RandomValueVsp = 0.1f;
+
+	igauss = indexof( posq );
+	linind_gauss = igauss.y * xstrwidth + igauss.x;
+	//2 random vectors used in each fragment
+	linind_gauss = 2 * linind_gauss + goffset; 	
+	
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+
+	//fg1 = fgauss[ igauss ];
+   fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
+
+	linind_gauss += 1.0f;
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+
+	//fg2 = fgauss[ igauss ];
+   fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
+	
+   /* 
+      Gromacs code
+    n               = 1 -> ngtc
+
+    sdc[n].gdt      = dt/tau_t[n];
+    sdc[n].eph      = exp(sdc[n].gdt/2);
+    sdc[n].emh      = exp(-sdc[n].gdt/2);
+    sdc[n].em       = exp(-sdc[n].gdt);
+
+    sdc[n].b        = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);
+
+    // 2.15 in paper (C)
+    sdc[n].c        = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
+
+    sdc[n].d        = 2 - sdc[n].eph - sdc[n].emh;
+
+      Vmh           = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);
+
+      V[n-start][d] = ism*sig[gt].V*fgauss(&jran);
+
+      v[n][d]       = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
+                      V[n-start][d] - sdc[gt].em*Vmh;
+           
+      xprime[n][d]  = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh); 
+  
+   */
+
+// ---------------------------------------------------------------------------------------------
+	// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
+   //sdpc.x = 1/sqrt(m)*sig[0].Yv
+	Vmh  = sd2X * pc3 + sdpc.x * fg1;
+// ---------------------------------------------------------------------------------------------
+
+// ---------------------------------------------------------------------------------------------
+	//sdpc.y = 1/sqrt(m)*sig[0].V
+	sd1V = sdpc.y * fg2;
+// ---------------------------------------------------------------------------------------------
+	
+
+// ---------------------------------------------------------------------------------------------
+   // 
+   // float pc1 = tau_t[0] * ( 1 - sdc[0].em )
+	vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
+	
+   // float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
+	posqp       = posq;
+	posqp.xyz  += vnew * pc2;
+}
+
+kernel void kupdate_sd2_fix1_FixedRV(
+		float xstrwidth,   //stream width of atoms
+		float gstrwidth,   //stream width of random numbers
+		float goffset,     //starting offset for random numbers
+		
+		float pc1,          //this is 1/pc2 for sd1 
+		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
+		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
+		
+
+		//per atom precalcs
+		//sdpc.x = 1/sqrt(m)*sig[0].Yx
+		//sdpc.y = 1/sqrt(m)*sig[0].X
+		float2 sdpc<>,
+
+		//Normally distributed random vectors
+		float3 fgauss[][],
+		
+		float3 sd1V<>,      //calculated in sd1
+		float4 posq<>,      //positions pre-update
+		float4 posqp<>,     //deltas post constraint
+		float3 vnew<>,      //velocities after sd1
+		out float3 sd2X<>,  //Used in sd1
+		out float3 v<>,     //velocities corrected for constraints
+		out float4 posqp2<> //new deltas, need to be constrained again
+		){
+	float linind_gauss; //linear index into the random numbers
+	float2 igauss;      //2d index
+	float3 fg1, fg2;
+	float3 Xmh;
+   float RandomValueVsp = 0.1f;
+
+	igauss = indexof( posq );
+	linind_gauss = igauss.y * xstrwidth + igauss.x;
+	linind_gauss = 2 * linind_gauss + goffset; 	
+	
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+	//fg1 = fgauss[ igauss ];
+   fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
+
+	linind_gauss += 1.0f;
+	//igauss.y = floor( linind_gauss / gstrwidth );
+   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
+	igauss.x = linind_gauss - igauss.y * gstrwidth;
+	// fg2 = fgauss[ igauss ];
+   fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
+			
+	//v   = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
+	v   = pc1 * (posqp.xyz - posq.xyz);
+	Xmh = sd1V * pc2 + sdpc.x * fg1;
+	
+	sd2X = sdpc.y * fg2;
+
+	posqp2 = posqp;
+	posqp2.xyz += sd2X - Xmh;
+}
+
--- a/platforms/brook/src/gpu/pre_test.br
+++ b/platforms/brook/src/gpu/pre_test.br
+/* pre-tests, known to be right */
+
+kernel void pre_test0( float input<>, out float output<> )
+{
+  output = input;
+}
+
+kernel void pre_test1( float input<>, out float output<> )
+{
+  output = ( input * input ) + input;
+}