/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2008 Stanford University and the Authors.           *
 * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns                       *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

kernel float4 scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2, float4 params ) {
 float4 invr, invrsig2, invrsig6;
 float4 f;

 invr = rsqrt( r2 );
 invrsig2 = invr * sig;
 invrsig2 = invrsig2 * invrsig2;
 invrsig6 = invrsig2 * invrsig2 * invrsig2;
 f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
 f += epsfac*qq*invr;
 f *= invr*invr;
 return f;
}

kernel float scalar_force_single_CDLJ( float qq, float epsfac, float sig, float eps, float r2, float4 params ) {
 float invr, invrsig2, invrsig6;
 float f;

 invr = rsqrt( r2 );
 invrsig2 = invr * sig;
 invrsig2 = invrsig2 * invrsig2;
 invrsig6 = invrsig2 * invrsig2 * invrsig2;
 f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
 f += epsfac*qq*invr;
 f *= invr*invr;
 return f;
}
kernel float4 get_r2_CDLJ( float3 d1, float3 d2, float3 d3, float3 d4 ) {
 float4 r2;
 r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
 return r2;
}

kernel void knbforce_CDLJ (
  float natoms,
  float dupfac,
  float strheight,
  float strwidth,
  float jstrwidth,
  float fstrwidth,
  float epsfac,
  float4 params,
  float4 posq[][],
  float4 isigeps<>,
  float4 sigma[][],
  float4 epsilon[][],
  float2 excl[][],


  out float3 outforce1<>,
  out float3 outforce2<> ) {

 float4 jsig, jeps;
 float3 ipos1, ipos2;
 float3 jpos1, jpos2, jpos3, jpos4;
 float qi1, qi2;
 float4 qj, qq;
 float3 d1, d2, d3, d4;
 float2 exclind;
 float4 sig, eps;
 float2 exclusions;
 float4 r2;
 float3 pad;
 float4 exclmask;
 float4 exclconst;
 float2 iatom;
 float2 jstrindex;
 float a_iatom;
 float linind;

 float breakflag;
 float2 jatom;
 float4 fs;
 float jend, jstart;
 float which_rep;

 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );


 outforce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
 outforce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );


   a_iatom = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);


 linind = fmod( a_iatom, natoms );
 iatom.x = fmod( linind, strwidth );
 iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );


 ipos1 = posq[ iatom ].xyz;
 qi1 = posq[ iatom ].w;

 iatom.x += 1;
 ipos2 = posq[ iatom ].xyz;
 qi2 = posq[ iatom ].w;


 breakflag = 1.0f;
 jatom.y = 0.0f;


 which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
 jend = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
 jstart = which_rep * jend;
 if ( which_rep > dupfac - 1 - 0.5f ) {

  jend = 1e6f;
 }
 jend += jstart;

 exclind.x = linind/2.0f;
 exclind.y = jstart * strwidth/4.0f;

 jatom.y = jstart;
 while ( jatom.y < jend && breakflag > 0.0f ) {
  jatom.x = 0;
  while ( jatom.x < strwidth && breakflag > 0.0f ) {

   exclusions = excl[ exclind ];

   exclmask = fmod( exclusions.x, exclconst );


   linind = (jatom.x + jatom.y*strwidth)/4.0f;
   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
   jstrindex.x = linind - jstrindex.y * jstrwidth;


   jsig = sigma[ jstrindex ];
   jeps = epsilon[ jstrindex ];

   jpos1 = posq[ jatom ].xyz;
   qj.x = posq[ jatom ].w;
   jatom.x += 1.0f;

   jpos2 = posq[ jatom ].xyz;
   qj.y = posq[ jatom ].w;
   jatom.x += 1.0f;

   jpos3 = posq[ jatom ].xyz;
   qj.z = posq[ jatom ].w;
   jatom.x += 1.0f;

   jpos4 = posq[ jatom ].xyz;
   qj.w = posq[ jatom ].w;
   jatom.x += 1.0f;


   sig = isigeps.x + jsig;
   eps = isigeps.y * jeps;
   qq = qi1 * qj;

   d1 = ipos1 - jpos1;
   d2 = ipos1 - jpos2;
   d3 = ipos1 - jpos3;
   d4 = ipos1 - jpos4;


   r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
   fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );

   outforce1 += fs.x * d1;
   outforce1 += fs.y * d2;
   outforce1 += fs.z * d3;
   outforce1 += fs.w * d4;
/*
outforce1.x += exclmask.x;
outforce1.x += exclmask.y;
outforce1.x += exclmask.z;
outforce1.x += exclmask.w;
if( exclmask.x > 0.5f ){
   outforce1.x += 1.0f;
}
if( exclmask.y > 0.5f ){
   outforce1.x += 1.0f;
}
if( exclmask.z > 0.5f ){
   outforce1.x += 1.0f;
}
if( exclmask.w > 0.5f ){
   outforce1.x += 1.0f;
}
*/

   exclmask = fmod( exclusions.y, exclconst );

   sig = isigeps.z + jsig;
   eps = isigeps.w * jeps;
   qq = qi2 * qj;

   d1 = ipos2 - jpos1;
   d2 = ipos2 - jpos2;
   d3 = ipos2 - jpos3;
   d4 = ipos2 - jpos4;

   r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
   fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );

   outforce2 += fs.x * d1;
   outforce2 += fs.y * d2;
   outforce2 += fs.z * d3;
   outforce2 += fs.w * d4;

/*
outforce2.x += exclmask.x;
outforce2.x += exclmask.y;
outforce2.x += exclmask.z;
outforce2.x += exclmask.w;
if( exclmask.x > 0.5f ){
   outforce2.x += 1.0f;
}
if( exclmask.y > 0.5f ){
   outforce2.x += 1.0f;
}
if( exclmask.z > 0.5f ){
   outforce2.x += 1.0f;
}
if( exclmask.w > 0.5f ){
   outforce2.x += 1.0f;
}
*/

   exclind.y += 1.0f;


   if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
    breakflag = -1.0f;
  }

  jatom.y += 1.0f;
 }

}

kernel void knbforce_CDLJ4(
  float natoms,
  float nAtomsCeiling,
  float dupfac,
  float strheight,
  float strwidth,
  float jstrwidth,
  float fstrwidth,
  float epsfac,
  float4 params,
  float3 posq[][],
  float  charge[][],
  float4 isigeps[][],
  float4 sigma[][],
  float4 epsilon[][],
  float excl[][],
  out float3 outforce1<>,
  out float3 outforce2<>,
  out float3 outforce3<>,
  out float3 outforce4<> ) {

 float4 jsig, jeps;
 float3 ipos1, ipos2, ipos3, ipos4;
 float4 jpos;
 float3 jpos1, jpos2, jpos3, jpos4;
 float qi1, qi2, qi3, qi4;

 float isig1, isig2, isig3, isig4;
 float ieps1, ieps2, ieps3, ieps4;

 float4 qj, qq;
 float3 d1, d2, d3, d4;
 float2 exclind;
 float4 sig, eps;
 float  exclusions;
 float4 r2;
 float3 pad;
 float4 exclmask;
 float4 exclconst;
 float2 iatom;
 float2 jstrindex;
 float a_iatom;
 float linind;

 float breakflag;
 float2 jatom;
 float4 fs;
 float jend, jstart;
 float which_rep;
 float maskMultiplier;

 maskMultiplier = 1.0e+06f;

 exclconst  = float4( 2.0f, 3.0f, 5.0f, 7.0f );

 outforce1  = float3( 0.0f, 0.0f, 0.0f );
 outforce2  = float3( 0.0f, 0.0f, 0.0f );
 outforce3  = float3( 0.0f, 0.0f, 0.0f );
 outforce4  = float3( 0.0f, 0.0f, 0.0f );

 a_iatom    = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);

 // linind     = fmod( a_iatom, natoms );
 linind     = fmod( a_iatom, nAtomsCeiling );

 iatom.x    = fmod( linind, strwidth );
 iatom.y    = round( (linind - fmod(linind, strwidth))/strwidth );

/* --------------------------------------------------------------------- */

 ipos1      = posq[ iatom ];
 qi1        = charge[ iatom ];

 jstrindex  = isigeps[ iatom ];
 isig1      = jstrindex.x;
 ieps1      = jstrindex.y;

/* --------------------------------------------------------------------- */

 iatom.x   += 1;
 ipos2      = posq[ iatom ];
 qi2        = charge[ iatom ];

 jstrindex  = isigeps[ iatom ];
 isig2      = jstrindex.x;
 ieps2      = jstrindex.y;

/* --------------------------------------------------------------------- */

 iatom.x   += 1;
 ipos3      = posq[ iatom ];
 qi3        = charge[ iatom ];

 jstrindex  = isigeps[ iatom ];
 isig3      = jstrindex.x;
 ieps3      = jstrindex.y;

/* --------------------------------------------------------------------- */

 iatom.x   += 1;
 ipos4      = posq[ iatom ];
 qi4        = charge[ iatom ];


 jstrindex  = isigeps[ iatom ];
 isig4      = jstrindex.x;
 ieps4      = jstrindex.y;

/* --------------------------------------------------------------------- */

 breakflag  = 1.0f;
 jatom.y    = 0.0f;

 // which_rep  = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
 which_rep  = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );

 jend       = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
 jstart     = which_rep * jend;
 if ( which_rep > dupfac - 1.5f ) {
  jend = 1e6f;
 }
 jend      += jstart;

 exclind.x  = linind;
 exclind.y  = jstart * strwidth/4.0f;
 exclind.y  = floor(  exclind.y  + 0.25f );

 jatom.y    = jstart;
 while ( jatom.y < jend && breakflag > 0.0f ) {
  jatom.x = 0;
  while ( jatom.x < strwidth && breakflag > 0.0f ) {

   exclusions  = excl[ exclind ];
   exclmask    = fmod( exclusions, exclconst );

   linind      = (jatom.x + jatom.y*strwidth)/4.0f;
   linind      = floor( linind + 0.25f );

   jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
   jstrindex.x = linind - jstrindex.y * jstrwidth;
   jstrindex.x = floor( jstrindex.x + 0.25f );

   /* --------------------------------------------------------------------- */

   jsig        = sigma[ jstrindex ];
   jeps        = epsilon[ jstrindex ];

   jpos1       = posq[ jatom ];
   qj.x        = charge[ jatom ];
   jatom.x     = floor( jatom.x + 1.2f );

   jpos2       = posq[ jatom ];
   qj.y        = charge[ jatom ];
   jatom.x     = floor( jatom.x + 1.2f );

   jpos3       = posq[ jatom ];
   qj.z        = charge[ jatom ];
   jatom.x     = floor( jatom.x + 1.2f );

   jpos4       = posq[ jatom ];
   qj.w        = charge[ jatom ];
   jatom.x     = floor( jatom.x + 1.2f );

   /* --------------------------------------------------------------------- */

   sig         = isig1 + jsig;
   eps         = ieps1 * jeps;
   qq          = qi1*qj;

   d1          = ipos1 - jpos1;
   d2          = ipos1 - jpos2;
   d3          = ipos1 - jpos3;
   d4          = ipos1 - jpos4;

   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + (exclmask*maskMultiplier);
   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );

   outforce1  += fs.x * d1;
   outforce1  += fs.y * d2;
   outforce1  += fs.z * d3;
   outforce1  += fs.w * d4;

   /* --------------------------------------------------------------------- */

   //exclind.x  += 1.0f;
   exclind.x   = floor( exclind.x + 1.2f );
   exclusions  = excl[ exclind ];
   exclmask    = fmod( exclusions, exclconst );

   sig         = isig2 + jsig;
   eps         = ieps2 * jeps;
   qq          = qi2 * qj;

   d1          = ipos2 - jpos1;
   d2          = ipos2 - jpos2;
   d3          = ipos2 - jpos3;
   d4          = ipos2 - jpos4;

   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + (exclmask*maskMultiplier);
   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );

   outforce2  += fs.x * d1;
   outforce2  += fs.y * d2;
   outforce2  += fs.z * d3;
   outforce2  += fs.w * d4;

   /* --------------------------------------------------------------------- */

   //exclind.x  += 1.0f;
   exclind.x   = floor( exclind.x + 1.2f );
   exclusions  = excl[ exclind ];
   exclmask    = fmod( exclusions, exclconst );

   sig         = isig3 + jsig;
   eps         = ieps3 * jeps;
   qq          = qi3   * qj;

   d1          = ipos3 - jpos1;
   d2          = ipos3 - jpos2;
   d3          = ipos3 - jpos3;
   d4          = ipos3 - jpos4;

   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + (exclmask*maskMultiplier);
   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );

   outforce3  += fs.x * d1;
   outforce3  += fs.y * d2;
   outforce3  += fs.z * d3;
   outforce3  += fs.w * d4;

   /* --------------------------------------------------------------------- */

   //exclind.x  += 1.0f;
   exclind.x   = floor( exclind.x + 1.2f );
   exclusions  = excl[ exclind ];
   exclmask    = fmod( exclusions, exclconst );

   sig         = isig4 + jsig;
   eps         = ieps4 * jeps;
   qq          = qi4   * qj;

   d1          = ipos4 - jpos1;
   d2          = ipos4 - jpos2;
   d3          = ipos4 - jpos3;
   d4          = ipos4 - jpos4;

   r2          = get_r2_CDLJ( d1, d2, d3, d4 ) + (exclmask*maskMultiplier);
   fs          = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );

   outforce4  += fs.x * d1;
   outforce4  += fs.y * d2;
   outforce4  += fs.z * d3;
   outforce4  += fs.w * d4;

   /* --------------------------------------------------------------------- */

   //exclind.x  -= 3.0f;
   exclind.x   = floor( exclind.x - 2.9f );

   //exclind.y  += 1.0f;
   exclind.y  = floor(  exclind.y  + 1.25f );

   if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
  }

  //jatom.y += 1.0f;
  jatom.y  = floor( jatom.y + 1.2f );
 }

}

kernel void kforce14_CDLJ(
  float natoms,
  float strwidth,
  float epsfac,
  float4 params,
  float2 atoms<>,
  float4 posq[][],
  float2 sigeps<>,
  out float3 fi<>,
  out float3 fj<>
  )
{
 float2 ai, aj;
 float3 d1;
 float r2;
 float qq, fs;

 //ai.y = floor( atoms.x / strwidth );
 ai.y = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
 ai.x = atoms.x - ai.y * strwidth;

 //aj.y = floor( atoms.y / strwidth );
 aj.y = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
 aj.x = atoms.y - aj.y * strwidth;

 d1 = posq[ai].xyz - posq[aj].xyz;
 qq = posq[ai].w * posq[aj].w;

 r2 = dot( d1, d1 );

 fs = scalar_force_single_CDLJ( qq, epsfac, sigeps.x, sigeps.y, r2, params );

 fi = fs.x * d1;
 fj = -fi;
}

kernel void kAcosV( float x<>, out float acosVal<> ){

   float x2, x3, x5, x7, x9, x11, x13, x15, x17, x19;

   float pi2           = 1.57079632679489661923f;
   float coefficient3  = 1.0f/6.0f;
   float coefficient5  = 3.0f/40.0f;
   float coefficient7  = 5.0f/112.0f;
   float coefficient9  = 35.0f/1152.0f;
   float coefficient11 = 63.0f/2816.0f;
   float coefficient13 = 231.0f/13312.0f;
   float coefficient15 = 143.0f/10240.0f;
   float coefficient17 = 6435.0f/557056.0f;
   float coefficient19 = 12155.0f/1245184.0f;
 
   // ---------------------------------------------------------------------------------------

   x2             =  x*x;
   x3             =  x*x2;
   x5             = x3*x2;
   x7             = x5*x2;
   x9             = x7*x2;
   x11            = x9*x2;
   x13            = x11*x2;
   x15            = x13*x2;
   x17            = x15*x2;
   x19            = x17*x2;

   acosVal        = x19*coefficient19;
   acosVal       += x17*coefficient17;
   acosVal       += x15*coefficient15;
   acosVal       += x13*coefficient13;
   acosVal       += x11*coefficient11;
   acosVal       += x9*coefficient9;
   acosVal       += x7*coefficient7;
   acosVal       += x5*coefficient5;
   acosVal       += x3*coefficient3 + x;
   acosVal        = pi2 - acosVal;

}

kernel void kAcosTanV( float xIn<>, out float acosVal<> ){

   const float c1             = 48.70107004404898384f;
   const float c2             = 49.5326263772254345f;
   const float c3             = 9.40604244231624f;
   const float c4             = 48.70107004404996166f;
   const float c5             = 65.7663163908956299f;
   const float c6             = 21.587934067020262f;
   const float tantwelfthpi   = 0.26794919243112270647255365849413f;
   const float tansixthpi     = 0.57735026918962576450914878050196f;
   const float sixthpi        = 0.52359877559829887307710723054658f;
   const float halfpi         = 1.5707963267948966192313216916398f;

   float x2, y, x;
   float sign, complement, region;

   // ---------------------------------------------------------------------------------------

   complement                 = 0.0f;
   region                     = 0.0f;

   x                          = xIn;
   y                          = rsqrt( (1.0f + x)*(1.0f  - x) );
   x                         *= y;
   if( x < 0.0f ){
      x     = -x;
      sign  = -1.0f;
   } else {
      sign  =  1.0f;
   }

   if( x > 1.0f ){
      x           = 1.0f/x;
      complement  = 1.0f;
   } else {
      complement  = 0.0f;
   }

   if( x > tantwelfthpi ){
      x        = (x-tansixthpi)/(1.0f + tansixthpi*x);
      region   = sixthpi;
   } else {
      region   = 0.0f;
   }

   x2          = x*x;
   y           = (x*(c1 + x2*(c2 + x2*c3))/(c4 + x2*(c5 + x2*(c6 + x2))));

   y          += region;

   y           = (complement > 0.0f) ? (halfpi- y) : y;
   acosVal     = halfpi - sign*y;

}

kernel void kAcos( float xIn, out float acosVal<> ){

   const float c1             = 48.70107004404898384f;
   const float c2             = 49.5326263772254345f;
   const float c3             = 9.40604244231624f;
   const float c4             = 48.70107004404996166f;
   const float c5             = 65.7663163908956299f;
   const float c6             = 21.587934067020262f;
   const float tantwelfthpi   = 0.26794919243112270647255365849413f;
   const float tansixthpi     = 0.57735026918962576450914878050196f;
   const float sixthpi        = 0.52359877559829887307710723054658f;
   const float halfpi         = 1.5707963267948966192313216916398f;

   float x2, y, x;
   float sign, complement, region;

   // ---------------------------------------------------------------------------------------

   // acos(x) = atan(x/sqrt(1-x**2))

   x        = xIn;
   y        = rsqrt( (1.0f + x)*(1.0f - x) );
   x        *= y;

   if( x < 0.0f ){
      x     = -x;
      sign  = -1.0f;
   } else {
      sign  =  1.0f;
   }

   if( x > 1.0f ){
      x           = 1.0f/x;
      complement  = 1.0f;
   } else {
      complement  = 0.0f;
   }

   if( x > tantwelfthpi ){
      x        = (x - tansixthpi)/(1.0f + tansixthpi*x);
      region   = sixthpi;
   } else {
      region   = 0.0f;
   }

   x2          = x*x;
   y           = (x*(c1 + x2*(c2 + x2*c3))/(c4 + x2*(c5 + x2*(c6 + x2))));

   y          += region;

   y           = (complement > 0.0f) ? (halfpi- y) : y;
   acosVal     = halfpi - sign*y;

}

kernel void kbonded_CDLJ (
      float epsfac,
      float xstrwidth,
      float4 nbparams,
      float3 posq[][],
      float  charge<>,
      float4 atoms<>,
      float4 parm0<>,
      float4 parm1<>,
      float4 parm2<>,
      float4 parm3<>,
      float4 parm4<>,
      out float3 fi<>,
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
   ){
  
   float3 rij, rkj, rkl, ril;
   float2 ai, aj, ak, al;
   float3 m, n;
   float sgnphi;
   float cosfac;
   float ddphi, ddphi_rb;
   float3 u, v, s;
   float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
   float3 uij, ukj, ukl;
   float phi, mdphi;
   float rij_d_ukj, ukj_d_rkl;
   float normfac;
   float qq;
   float3 fi_ang, fj_ang, fk_ang;
   float3 fi_bond;
   float cii, ckk, cik;
   float fs, st, sth;
   float3 fi_pair;
   float r2 ;
   float torsion_numerator;
   float theta1, costheta1, invsintheta1;
   float theta2, costheta2, invsintheta2;
   float3 posqi, posqj, posqk, posql;
   float sig, eps;
  
   // i-coordinates

   ai.y  = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
   ai.x  = atoms.x - ai.y * xstrwidth;
   posqi = posq[ ai ];
  
   // j-coordinates

   if ( atoms.y > -0.5f ) {
      aj.y  = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
      aj.x  = atoms.y - aj.y * xstrwidth;
      posqj = posq[ aj ];
   } else {
      posqj = float3( 0.0f, 1.0f, 0.0f );
   }
  
   // k-coordinates

   if ( atoms.z > -0.5f ) {
      ak.y = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
      ak.x = atoms.z - ak.y * xstrwidth;
      posqk = posq[ ak ];
   } else { 
      posqk = float3( 1.0f, 1.0f, 0.0f );
   }
  
   // l-coordinates

   if ( atoms.w > -0.5f ) {
      al.y  = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
      al.x  = atoms.w - al.y * xstrwidth;
      posql = posq[ al ];
   } else {
      posql = float3( 1.0f, 1.0f, 1.0f );
   }
  
   rij             = posqi - posqj;
   rkj             = posqk - posqj;
   rkl             = posqk - posql;
   ril             = posqi - posql;
  
   invlij          = rsqrt( dot( rij, rij ) );
   invlkj          = rsqrt( dot( rkj, rkj ) );
   invlkl          = rsqrt( dot( rkl, rkl ) );
  
   uij             = rij * invlij;
   ukj             = rkj * invlkj;
   ukl             = rkl * invlkl;
  
   rij_d_ukj       = dot( rij, ukj );
   ukj_d_rkl       = dot( ukj, rkl );
  
   m               = cross( uij, ukj );
   n               = cross( ukj, ukl );
  
   costheta1       = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
   //theta1          = acos( costheta1 );
   kAcos( costheta1, theta1 );
   invsintheta1    = rsqrt( (1.0f - costheta1) * (1.0f + costheta1) );
  
   costheta2       = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
   //theta2          = acos( costheta2 );
   kAcos( costheta2, theta2 );
   invsintheta2    = rsqrt( (1.0f - costheta2) * (1.0f + costheta2) );
  
   cos_phi         = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
   sgnphi          = sign( dot(rij, n) );
  
   phi             = sgnphi * acos( cos_phi );
   mdphi           = parm1.w * phi - parm1.z;
   ddphi           = -parm1.y * parm1.w * sin( mdphi );
  
   cos_phi         = -cos_phi;
   sin_phi         = -sgnphi*sqrt( (1.0f - cos_phi) * (1.0f + cos_phi) );
  
   ddphi_rb        = 5.0f * parm1.x;
   ddphi_rb        = 4.0f * parm0.w + ddphi_rb * cos_phi;
   ddphi_rb        = 3.0f * parm0.z + ddphi_rb * cos_phi;
   ddphi_rb        = 2.0f * parm0.y + ddphi_rb * cos_phi;
   ddphi_rb        = parm0.x + ddphi_rb * cos_phi;
   ddphi_rb        = -ddphi_rb * sin_phi;
  
   ddphi          += ddphi_rb;
/*
fi = float3( cos_phi, sgnphi, ddphi_rb );
fj = float3( sin_phi, parm1.x, parm0.w );
fk = float3( parm0.z, parm0.y, parm0.x );
fl = float3( ddphi, 0.0f, 0.0f );
*/
  
   fi              = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
   fl              = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
  
   u               = rij_d_ukj * invlkj * fi;
   v               = ukj_d_rkl * invlkj * fl;
   s               = u - v;
  
   fj              = s - fi;
   fk              = -(s + fl);
  
  
   fs              = -parm2.y * ( theta1 - parm2.x );
   st              = fs * invsintheta1;
   st              = clamp( st, -100000.0f, 100000.0f );
  
   sth             = st * costheta1;
  
   cik             = st * invlkj * invlij;
   cii             = sth * invlij;
   ckk             = sth * invlkj;
  
   fi_ang          = -( cik * rkj - cii * uij );
   fk_ang          = -( cik * rij - ckk * ukj );
   fj_ang          = -fi_ang - fk_ang;
  
   fi             += fi_ang;
   fj             += fj_ang;
   fk             += fk_ang;
   fs              = -parm2.w * ( theta2 - parm2.z );
   st              = fs * invsintheta2;
   st              = clamp( st, -100000.0f, 100000.0f );
  
   sth             = st * costheta2;
  
   cik             = st * invlkj * invlkl;
   cii             = sth * invlkj;
   ckk             = sth * invlkl;
  
   fi_ang          = ( cik * rkl - cii * ukj );
   fk_ang          = ( cik * rkj - ckk * ukl );
   fj_ang          = -fi_ang - fk_ang;
  
   fj             += fi_ang;
   fk             += fj_ang;
   fl             += fk_ang;
  
  
   fi_bond         = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
   fi             += fi_bond;
   fj             += -fi_bond;
  
   fi_bond         = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
   fj             += fi_bond;
   fk             += -fi_bond;
  
   fi_bond         = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
   fk             += fi_bond;
   fl             += -fi_bond;
  
   if( parm4.z > -0.5f ){
      r2        = dot( ril, ril );
      fs        = scalar_force_single_CDLJ( charge, epsfac, parm4.z, parm4.w, r2, nbparams );
      fi_pair   = fs * ril;
      fi       += fi_pair;
      fl       -= fi_pair;
   }

}