Commit b53cd947 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Latest mods

parent ff88ddad
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Kernel to set the xyz components of a float4 stream
//Used for changing the coordinates without changing
//the charges in strPosQ
kernel void ksetxyz( float3 instr<>, float4 before<>, out float4 after<> ) {
after.xyz = instr;
after.w = before.w;
}
//Inverse of above
kernel void kgetxyz( float4 instr<>, out float3 outstr<> ) {
outstr = instr.xyz;
}
//Zeroes out a stream
kernel void kzerof3( out float3 outstr<> ) {
outstr = float3( 0.0, 0.0, 0.0 );
}
//Zeros out a stream
kernel void kzerof4( out float4 outstr<> ) {
outstr = float4( 0.0, 0.0, 0.0, 0.0 );
}
kernel void ksetf4( float4 val, out float4 outstr<> ) {
outstr = val;
}
#ifndef __KFORCE_H__
#define __KFORCE_H__
//Define a generic force kernel prototype
//To make switching easier to look at
//This should match the kernels from kforce.br
typedef void (*gpuNBForceFunction)(
const float natoms,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2
);
void knbforce_CDLJ (const float natoms,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2);
void knbforce_CDLJ2(const float natoms,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2);
void knbforce_CDLJ4(
const float natoms,
const float nAtomsCeiling,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2,
::brook::stream outforce3,
::brook::stream outforce4);
void knbforce_CDLJ4Debug(
const float natoms,
const float nAtomsCeiling,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2,
::brook::stream outforce3,
::brook::stream outforce4);
void knbforce_CDLJ_1(
const float natoms,
const float nAtomsCeiling,
const float strwidth,
const float epsfac,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream excl,
::brook::stream outforce );
void knbforce_CDLJ4NoEx(
const float natoms,
const float nAtomsCeiling,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2,
::brook::stream outforce3,
::brook::stream outforce4);
void knbforce_LDLJ (const float natoms,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
const __BRTIter& a_iatom2,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2);
void knbforce_SFDLJ (const float natoms,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
const __BRTIter& a_iatom2,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2);
void knbforce_SHEFFIELD (const float natoms,
const float dupfac,
const float strheight,
const float strwidth,
const float jstrwidth,
const float fstrwidth,
const float epsfac,
const float4 params,
const __BRTIter& a_iatom2,
::brook::stream posq,
::brook::stream isigeps,
::brook::stream sigma,
::brook::stream epsilon,
::brook::stream excl,
::brook::stream outforce1,
::brook::stream outforce2);
void kmerge_partial_forces (const float repfac,
const float atomStrWidth,
const float pforceStrWidth,
const float natoms,
::brook::stream pforce1,
::brook::stream pforce2,
::brook::stream force);
void kMergeFloat3_4(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
::brook::stream pforce1,
::brook::stream pforce2,
::brook::stream pforce3,
::brook::stream pforce4,
::brook::stream force );
void kMergeFloat3_4_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
::brook::stream pforce1,
::brook::stream pforce2,
::brook::stream pforce3,
::brook::stream pforce4,
::brook::stream force );
typedef void ( *gpuNBForceFunction14 )
(const float natoms,
const float strwidth,
const float epsfac,
const float4 params,
::brook::stream atoms,
::brook::stream posq,
::brook::stream sigeps,
::brook::stream fi,
::brook::stream fj);
void kforce14_CDLJ
(const float natoms,
const float strwidth,
const float epsfac,
const float4 params,
::brook::stream atoms,
::brook::stream posq,
::brook::stream sigeps,
::brook::stream fi,
::brook::stream fj);
void kforce14_LDLJ
(const float natoms,
const float strwidth,
const float epsfac,
const float4 params,
::brook::stream atoms,
::brook::stream posq,
::brook::stream sigeps,
::brook::stream fi,
::brook::stream fj);
void kforce14_SFDLJ
(const float natoms,
const float strwidth,
const float epsfac,
const float4 params,
::brook::stream atoms,
::brook::stream posq,
::brook::stream sigeps,
::brook::stream fi,
::brook::stream fj);
void kforce14_SHEFFIELD
(const float natoms,
const float strwidth,
const float epsfac,
const float4 params,
::brook::stream atoms,
::brook::stream posq,
::brook::stream sigeps,
::brook::stream fi,
::brook::stream fj);
typedef void (*gpuBondedFunction)(
const float epsfac,
const float xstrwidth,
const float4 params,
::brook::stream posq,
::brook::stream atoms,
::brook::stream parm0,
::brook::stream parm1,
::brook::stream parm2,
::brook::stream parm3,
::brook::stream parm4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl);
void kbonded_CDLJ (const float epsfac,
const float xstrwidth,
const float4 params,
::brook::stream posq,
::brook::stream atoms,
::brook::stream parm0,
::brook::stream parm1,
::brook::stream parm2,
::brook::stream parm3,
::brook::stream parm4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl);
void kbonded_CDLJDebug (const float epsfac,
const float xstrwidth,
const float4 params,
::brook::stream posq,
::brook::stream atoms,
::brook::stream parm0,
::brook::stream parm1,
::brook::stream parm2,
::brook::stream parm3,
::brook::stream parm4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl);
void kbonded_LDLJ (const float epsfac,
const float xstrwidth,
const float4 params,
::brook::stream posq,
::brook::stream atoms,
::brook::stream parm0,
::brook::stream parm1,
::brook::stream parm2,
::brook::stream parm3,
::brook::stream parm4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl);
void kbonded_SFDLJ (const float epsfac,
const float xstrwidth,
const float4 params,
::brook::stream posq,
::brook::stream atoms,
::brook::stream parm0,
::brook::stream parm1,
::brook::stream parm2,
::brook::stream parm3,
::brook::stream parm4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl);
void kbonded_SHEFFIELD (const float epsfac,
const float xstrwidth,
const float4 params,
::brook::stream posq,
::brook::stream atoms,
::brook::stream parm0,
::brook::stream parm1,
::brook::stream parm2,
::brook::stream parm3,
::brook::stream parm4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl);
void kAddForces3_4(
const float conversion,
::brook::stream force1,
::brook::stream force2,
::brook::stream outForce );
#endif //__KFORCE_H__
kernel float4 scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2, float4 params ) {
float4 invr, invrsig2, invrsig6;
float4 f;
invr = rsqrt( r2 );
invrsig2 = invr * sig;
invrsig2 = invrsig2 * invrsig2;
invrsig6 = invrsig2 * invrsig2 * invrsig2;
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
f += epsfac*qq*invr;
f *= invr*invr;
return f;
}
kernel float scalar_force_single_CDLJ( float qq, float epsfac, float sig, float eps, float r2 ){
float invr, invrsig2, invrsig6;
float f;
invr = rsqrt( r2 );
invrsig2 = invr * sig;
invrsig2 = invrsig2 * invrsig2;
invrsig6 = invrsig2 * invrsig2 * invrsig2;
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
f += epsfac*qq*invr;
f *= invr*invr;
return f;
}
kernel float4 get_r2_CDLJ( float3 d1, float3 d2, float3 d3, float3 d4 ) {
float4 r2;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
return r2;
}
kernel void knbforce_CDLJ (
float natoms,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps<>,
float4 sigma[][],
float4 epsilon[][],
float2 excl[][],
out float3 outforce1<>,
out float3 outforce2<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float2 exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
outforce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
a_iatom = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
linind = fmod( a_iatom, natoms );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
ipos1 = posq[ iatom ].xyz;
qi1 = posq[ iatom ].w;
iatom.x += 1;
ipos2 = posq[ iatom ].xyz;
qi2 = posq[ iatom ].w;
breakflag = 1.0f;
jatom.y = 0.0f;
which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
jend = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1 - 0.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind/2.0f;
exclind.y = jstart * strwidth/4.0f;
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
exclusions = excl[ exclind ];
exclmask = fmod( exclusions.x, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos1 = posq[ jatom ].xyz;
qj.x = posq[ jatom ].w;
jatom.x += 1.0f;
jpos2 = posq[ jatom ].xyz;
qj.y = posq[ jatom ].w;
jatom.x += 1.0f;
jpos3 = posq[ jatom ].xyz;
qj.z = posq[ jatom ].w;
jatom.x += 1.0f;
jpos4 = posq[ jatom ].xyz;
qj.w = posq[ jatom ].w;
jatom.x += 1.0f;
sig = isigeps.x + jsig;
eps = isigeps.y * jeps;
qq = qi1 * qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
/*
outforce1.x += exclmask.x;
outforce1.x += exclmask.y;
outforce1.x += exclmask.z;
outforce1.x += exclmask.w;
if( exclmask.x > 0.5f ){
outforce1.x += 1.0f;
}
if( exclmask.y > 0.5f ){
outforce1.x += 1.0f;
}
if( exclmask.z > 0.5f ){
outforce1.x += 1.0f;
}
if( exclmask.w > 0.5f ){
outforce1.x += 1.0f;
}
*/
exclmask = fmod( exclusions.y, exclconst );
sig = isigeps.z + jsig;
eps = isigeps.w * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
/*
outforce2.x += exclmask.x;
outforce2.x += exclmask.y;
outforce2.x += exclmask.z;
outforce2.x += exclmask.w;
if( exclmask.x > 0.5f ){
outforce2.x += 1.0f;
}
if( exclmask.y > 0.5f ){
outforce2.x += 1.0f;
}
if( exclmask.z > 0.5f ){
outforce2.x += 1.0f;
}
if( exclmask.w > 0.5f ){
outforce2.x += 1.0f;
}
*/
exclind.y += 1.0f;
if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
breakflag = -1.0f;
}
jatom.y += 1.0f;
}
}
kernel void knbforce_CDLJ2(
float natoms,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2;
float isig1, isig2;
float ieps1, ieps2;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
outforce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
a_iatom = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
linind = fmod( a_iatom, natoms );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
iatom.x += 1;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
breakflag = 1.0f;
jatom.y = 0.0f;
which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
jend = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1 - 0.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind;
exclind.y = jstart * strwidth/4.0f;
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x += 1.0f;
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
exclind.x += 1.0f;
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
exclind.x -= 1.0f;
exclind.y += 1.0f;
if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
breakflag = -1.0f;
}
jatom.y += 1.0f;
}
}
kernel void knbforce_CDLJ4(
float natoms,
float nAtomsCeiling,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<>,
out float3 outforce3<>,
out float3 outforce4<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2, ipos3, ipos4;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2, qi3, qi4;
float isig1, isig2, isig3, isig4;
float ieps1, ieps2, ieps3, ieps4;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float3( 0.0f, 0.0f, 0.0f );
outforce2 = float3( 0.0f, 0.0f, 0.0f );
outforce3 = float3( 0.0f, 0.0f, 0.0f );
outforce4 = float3( 0.0f, 0.0f, 0.0f );
a_iatom = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
// linind = fmod( a_iatom, natoms );
linind = fmod( a_iatom, nAtomsCeiling );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
/* --------------------------------------------------------------------- */
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos3 = jpos.xyz;
qi3 = jpos.w;
jstrindex = isigeps[ iatom ];
isig3 = jstrindex.x;
ieps3 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos4 = jpos.xyz;
qi4 = jpos.w;
jstrindex = isigeps[ iatom ];
isig4 = jstrindex.x;
ieps4 = jstrindex.y;
/* --------------------------------------------------------------------- */
breakflag = 1.0f;
jatom.y = 0.0f;
// which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
which_rep = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
jend = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind;
exclind.y = jstart * strwidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
/* --------------------------------------------------------------------- */
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
/* --------------------------------------------------------------------- */
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig3 + jsig;
eps = ieps3 * jeps;
qq = qi3 * qj;
d1 = ipos3 - jpos1;
d2 = ipos3 - jpos2;
d3 = ipos3 - jpos3;
d4 = ipos3 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce3 += fs.x * d1;
outforce3 += fs.y * d2;
outforce3 += fs.z * d3;
outforce3 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig4 + jsig;
eps = ieps4 * jeps;
qq = qi4 * qj;
d1 = ipos4 - jpos1;
d2 = ipos4 - jpos2;
d3 = ipos4 - jpos3;
d4 = ipos4 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce4 += fs.x * d1;
outforce4 += fs.y * d2;
outforce4 += fs.z * d3;
outforce4 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x -= 3.0f;
exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
exclind.y = floor( exclind.y + 1.25f );
if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
}
//jatom.y += 1.0f;
jatom.y = floor( jatom.y + 1.2f );
}
}
kernel void knbforce_CDLJ4Debug(
float natoms,
float nAtomsCeiling,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<>,
out float3 outforce3<>,
out float3 outforce4<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2, ipos3, ipos4;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2, qi3, qi4;
float isig1, isig2, isig3, isig4;
float ieps1, ieps2, ieps3, ieps4;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float linind1;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
float4 one4;
float4 zero4;
float4 frac4;
float sum;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float3( 0.0f, 0.0f, 0.0f );
outforce2 = float3( 0.0f, 0.0f, 0.0f );
outforce3 = float3( 0.0f, 0.0f, 0.0f );
outforce4 = float3( 0.0f, 0.0f, 0.0f );
one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
frac4 = float4( 0.2f, 0.2f, 0.2f, 0.2f );
a_iatom = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
// linind = fmod( a_iatom, natoms );
linind = fmod( a_iatom, nAtomsCeiling );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
/* --------------------------------------------------------------------- */
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1;
jpos = posq[ iatom ];
ipos3 = jpos.xyz;
qi3 = jpos.w;
jstrindex = isigeps[ iatom ];
isig3 = jstrindex.x;
ieps3 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1;
jpos = posq[ iatom ];
ipos4 = jpos.xyz;
qi4 = jpos.w;
jstrindex = isigeps[ iatom ];
isig4 = jstrindex.x;
ieps4 = jstrindex.y;
/* --------------------------------------------------------------------- */
breakflag = 1.0f;
jatom.y = 0.0f;
// which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
which_rep = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
jend = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep*jend;
if ( which_rep > dupfac - 1.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind;
linind1 = linind;
exclind.y = jstart*strwidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
/*
outforce1 = float3( linind, exclind.y, jstart );
outforce2 = float3( linind, exclind.y, jend );
outforce3 = float3( linind, exclind.y, strwidth );
outforce4 = float3( linind, exclind.y, exclind.y );
*/
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
linind = (jatom.x + jatom.y*strwidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y*jstrwidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
// ---------------------------------------------------------------------
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
// ---------------------------------------------------------------------
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce1 += fs.x * d1;
// outforce1 += fs.y * d2;
// outforce1 += fs.z * d3;
// outforce1 += fs.w * d4;
// outforce1 += float3( 4.0, 4.0, 4.0);
// outforce1 += 1.0/r2;
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
/*
if( sum > 0.25f ){
outforce1.x += 1.0f;
outforce1.y += exclusions;
outforce1.z += sum;
// outforce1.y += exclind.x + strwidth*exclind.y;
//outforce1.y += 4.0;
}
*/
if( exclind.y < 5.9f && exclind.y > 4.1f ){
outforce1.y += exclind.x + 576.0f*exclind.y;
outforce1.z += exclusions;
outforce1.x += exclind.y;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce2 += fs.x * d1;
// outforce2 += fs.y * d2;
// outforce2 += fs.z * d3;
// outforce2 += fs.w * d4;
// outforce2 += float3( 4.0f, 4.0f, 4.0f);
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
outforce2.x += 1.0f;
outforce2.y += exclusions;
outforce2.z += sum;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig3 + jsig;
eps = ieps3 * jeps;
qq = qi3 * qj;
d1 = ipos3 - jpos1;
d2 = ipos3 - jpos2;
d3 = ipos3 - jpos3;
d4 = ipos3 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce3 += fs.x * d1;
// outforce3 += fs.y * d2;
// outforce3 += fs.z * d3;
// outforce3 += fs.w * d4;
// outforce3 += float3( 4.0f, 4.0f, 4.0f);
// outforce3 += 1.0f/r2;
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
outforce3.x += 1.0f;
outforce3.y += exclusions;
outforce3.z += sum;
// outforce3.y += exclind.x + strwidth*exclind.y;
//outforce3.y += 4.0;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig4 + jsig;
eps = ieps4 * jeps;
qq = qi4 * qj;
d1 = ipos4 - jpos1;
d2 = ipos4 - jpos2;
d3 = ipos4 - jpos3;
d4 = ipos4 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce4 += fs.x * d1;
// outforce4 += fs.y * d2;
// outforce4 += fs.z * d3;
// outforce4 += fs.w * d4;
// outforce4 += float3( 4.0f, 4.0f, 4.0f);
// outforce4 += 1.0f/r2;
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
outforce4.x += 1.0f;
outforce4.y += exclusions;
outforce4.z += sum;
// outforce4.y += exclind.x + strwidth*exclind.y;
//outforce4.y += 4.0;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x - 2.9f );
exclind.y = floor( exclind.y + 1.2f );
if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
breakflag = -1.0f;
}
jatom.y = floor( jatom.y + 1.2f );
}
}
kernel void kbonded_CDLJ (
float xstrwidth,
float4 posq[][],
float4 atoms<>,
float4 parm0<>,
float4 parm1<>,
float4 parm2<>,
float4 parm3<>,
float4 parm4<>,
out float3 fi<>,
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
)
{
float3 rij, rkj, rkl, ril;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float ddphi, ddphi_rb;
float3 u, v, s;
float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
float3 uij, ukj, ukl;
float phi, mdphi;
float rij_d_ukj, ukj_d_rkl;
float normfac;
float qq;
float3 fi_ang, fj_ang, fk_ang;
float3 fi_bond;
float cii, ckk, cik;
float fs, st, sth;
float3 fi_pair;
float r2 ;
float torsion_numerator;
float theta1, costheta1, invsintheta1;
float theta2, costheta2, invsintheta2;
float4 posqi, posqj, posqk, posql;
float sig, eps;
ai.y = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
posqi = posq[ ai ];
if ( atoms.y > -0.5f ) {
aj.y = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
posqj = posq[ aj ];
}
else
posqj = float4( 0.0f, 0.0f, 1.0f, 0.0f );
if ( atoms.z > -0.5f ) {
ak.y = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
posqk = posq[ ak ];
}
else
posqk = float4( 0.0f, 1.0f, 1.0f, 0.0f );
if ( atoms.w > -0.5f ) {
al.y = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
posql = posq[ al ];
}
else
posql = float4( 1.0f, 1.0f, 1.0f, 0.0f );
qq = posqi.w * posql.w;
rij = posqi.xyz - posqj.xyz;
rkj = posqk.xyz - posqj.xyz;
rkl = posqk.xyz - posql.xyz;
ril = posqi.xyz - posql.xyz;
invlij = rsqrt( dot( rij, rij ) );
invlkj = rsqrt( dot( rkj, rkj ) );
invlkl = rsqrt( dot( rkl, rkl ) );
uij = rij * invlij;
ukj = rkj * invlkj;
ukl = rkl * invlkl;
rij_d_ukj = dot( rij, ukj );
ukj_d_rkl = dot( ukj, rkl );
m = cross( uij, ukj );
n = cross( ukj, ukl );
costheta1 = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
theta1 = acos( costheta1 );
invsintheta1 = rsqrt( 1.0f - costheta1 * costheta1 );
costheta2 = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
theta2 = acos( costheta2 );
invsintheta2 = rsqrt( 1.0f - costheta2 * costheta2 );
cos_phi = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
sgnphi = sign( dot(rij, n) );
phi = sgnphi * acos( cos_phi );
mdphi = parm1.w * phi - parm1.z;
ddphi = -parm1.y * parm1.w * sin( mdphi );
cos_phi = -cos_phi;
sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) );
ddphi_rb = 5.0f * parm1.x;
ddphi_rb = 4.0f * parm0.w + ddphi_rb * cos_phi;
ddphi_rb = 3.0f * parm0.z + ddphi_rb * cos_phi;
ddphi_rb = 2.0f * parm0.y + ddphi_rb * cos_phi;
ddphi_rb = parm0.x + ddphi_rb * cos_phi;
ddphi_rb = -ddphi_rb * sin_phi;
ddphi += ddphi_rb;
fi = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
fl = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
u = rij_d_ukj * invlkj * fi;
v = ukj_d_rkl * invlkj * fl;
s = u - v;
fj = s - fi;
fk = -(s + fl);
fs = -parm2.y * ( theta1 - parm2.x );
st = fs * invsintheta1;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta1;
cik = st * invlkj * invlij;
cii = sth * invlij;
ckk = sth * invlkj;
fi_ang = -( cik * rkj - cii * uij );
fk_ang = -( cik * rij - ckk * ukj );
fj_ang = -fi_ang - fk_ang;
fi += fi_ang;
fj += fj_ang;
fk += fk_ang;
fs = -parm2.w * ( theta2 - parm2.z );
st = fs * invsintheta2;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta2;
cik = st * invlkj * invlkl;
cii = sth * invlkj;
ckk = sth * invlkl;
fi_ang = ( cik * rkl - cii * ukj );
fk_ang = ( cik * rkj - ckk * ukl );
fj_ang = -fi_ang - fk_ang;
fj += fi_ang;
fk += fj_ang;
fl += fk_ang;
fi_bond = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
fi += fi_bond;
fj += -fi_bond;
fi_bond = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
fj += fi_bond;
fk += -fi_bond;
fi_bond = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
fk += fi_bond;
fl += -fi_bond;
if ( parm4.z > -0.5f ) {
r2 = dot( ril, ril );
fs = scalar_force_single_CDLJ( qq, epsfac, parm4.z, parm4.w, r2 );
fi_pair = fs * ril;
fi += fi_pair;
fl -= fi_pair;
}
}
kernel void kbonded_CDLJDebug (
float epsfac,
float xstrwidth,
float4 nbparams,
float4 posq[][],
float4 atoms<>,
float4 parm0<>,
float4 parm1<>,
float4 parm2<>,
float4 parm3<>,
float4 parm4<>,
out float3 fi<>,
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
)
{
float3 rij, rkj, rkl, ril;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float ddphi, ddphi_rb;
float3 u, v, s;
float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
float3 uij, ukj, ukl;
float phi, mdphi;
float rij_d_ukj, ukj_d_rkl;
float normfac;
float qq;
float3 fi_ang, fj_ang, fk_ang;
float3 fi_bond;
float cii, ckk, cik;
float fs, st, sth;
float3 fi_pair;
float r2 ;
float torsion_numerator;
float theta1, costheta1, invsintheta1;
float theta2, costheta2, invsintheta2;
float4 posqi, posqj, posqk, posql;
float sig, eps;
ai.y = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
posqi = posq[ ai ];
if ( atoms.y > -0.5f ) {
aj.y = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
posqj = posq[ aj ];
} else {
posqj = float4( 0.0f, 0.0f, 1.0f, 0.0f );
}
if ( atoms.z > -0.5f ) {
ak.y = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
posqk = posq[ ak ];
} else {
posqk = float4( 0.0f, 1.0f, 1.0f, 0.0f );
}
if ( atoms.w > -0.5f ) {
al.y = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
posql = posq[ al ];
} else {
posql = float4( 1.0f, 1.0f, 1.0f, 0.0f );
}
qq = posqi.w * posql.w;
rij = posqi.xyz - posqj.xyz;
rkj = posqk.xyz - posqj.xyz;
rkl = posqk.xyz - posql.xyz;
ril = posqi.xyz - posql.xyz;
invlij = rsqrt( dot( rij, rij ) );
invlkj = rsqrt( dot( rkj, rkj ) );
invlkl = rsqrt( dot( rkl, rkl ) );
uij = rij * invlij;
ukj = rkj * invlkj;
ukl = rkl * invlkl;
rij_d_ukj = dot( rij, ukj );
ukj_d_rkl = dot( ukj, rkl );
m = cross( uij, ukj );
n = cross( ukj, ukl );
costheta1 = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
theta1 = acos( costheta1 );
invsintheta1 = rsqrt( 1.0f - costheta1 * costheta1 );
costheta2 = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
theta2 = acos( costheta2 );
invsintheta2 = rsqrt( 1.0f - costheta2 * costheta2 );
cos_phi = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
sgnphi = sign( dot(rij, n) );
phi = sgnphi * acos( cos_phi );
mdphi = parm1.w * phi - parm1.z;
ddphi = -parm1.y * parm1.w * sin( mdphi );
cos_phi = -cos_phi;
sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) );
ddphi_rb = 5.0f * parm1.x;
ddphi_rb = 4.0f * parm0.w + ddphi_rb * cos_phi;
ddphi_rb = 3.0f * parm0.z + ddphi_rb * cos_phi;
ddphi_rb = 2.0f * parm0.y + ddphi_rb * cos_phi;
ddphi_rb = parm0.x + ddphi_rb * cos_phi;
ddphi_rb = -ddphi_rb * sin_phi;
ddphi += ddphi_rb;
//ddphi = ddphi_rb;
//ddphi = 0.0f;
fi = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
fl = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
u = rij_d_ukj * invlkj * fi;
v = ukj_d_rkl * invlkj * fl;
s = u - v;
fj = s - fi;
fk = -(s + fl);
fs = -parm2.y * ( theta1 - parm2.x );
st = fs * invsintheta1;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta1;
cik = st * invlkj * invlij;
cii = sth * invlij;
ckk = sth * invlkj;
fi_ang = -( cik * rkj - cii * uij );
fk_ang = -( cik * rij - ckk * ukj );
fj_ang = -fi_ang - fk_ang;
/*
fi += fi_ang;
fj += fj_ang;
fk += fk_ang;
*/
fs = -parm2.w * ( theta2 - parm2.z );
st = fs * invsintheta2;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta2;
cik = st * invlkj * invlkl;
cii = sth * invlkj;
ckk = sth * invlkl;
fi_ang = ( cik * rkl - cii * ukj );
fk_ang = ( cik * rkj - ckk * ukl );
fj_ang = -fi_ang - fk_ang;
/*
fj += fi_ang;
fk += fj_ang;
fl += fk_ang;
*/
// bonded terms
// k*(r - d)*deltaX/r = k*(1 - d/r)*deltaX
/*
fi = float3( 0.0f, 0.0f, 0.0f );
fj = float3( 0.0f, 0.0f, 0.0f );
fk = float3( 0.0f, 0.0f, 0.0f );
fl = float3( 0.0f, 0.0f, 0.0f );
*/
/*
// ------------------------------------------------------------
fi_bond = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
fi += fi_bond;
fj += -fi_bond;
// ------------------------------------------------------------
fi_bond = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
fj += fi_bond;
fk += -fi_bond;
// ------------------------------------------------------------
fi_bond = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
fk += fi_bond;
fl += -fi_bond;
*/
// ------------------------------------------------------------
// LJ14 terms
//fi = float3( 0.0f, 0.0f, 0.0f);
//fj = float3( 0.0f, 0.0f, 0.0f);
//fk = float3( 0.0f, 0.0f, 0.0f);
//fl = float3( 0.0f, 0.0f, 0.0f);
/*
if ( parm4.z > -0.5f ) {
r2 = dot( ril, ril );
fs = scalar_force_single_CDLJ( qq, epsfac, parm4.z, parm4.w, r2 );
fi_pair = fs * ril;
//fi_pair = float3( posqi.w + posql.w, posqi.w, posql.w );
//fi_pair = float3( posqi.w + posql.w, posqi.w, posql.w );
//fi_pair = float3( atoms.x, parm4.z, parm4.w );
//fi_pair = float3( 0.0f, 0.0f, 0.0f);
fi += fi_pair;
fl -= fi_pair;
//fi = fi_pair;
//fl = float3( atoms.w, parm4.z, parm4.w );
}
*/
}
kernel void knbforce_CDLJ4NoEx(
float natoms,
float nAtomsCeiling,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<>,
out float3 outforce3<>,
out float3 outforce4<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2, ipos3, ipos4;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2, qi3, qi4;
float isig1, isig2, isig3, isig4;
float ieps1, ieps2, ieps3, ieps4;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float3( 0.0f, 0.0f, 0.0f );
outforce2 = float3( 0.0f, 0.0f, 0.0f );
outforce3 = float3( 0.0f, 0.0f, 0.0f );
outforce4 = float3( 0.0f, 0.0f, 0.0f );
a_iatom = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
// linind = fmod( a_iatom, natoms );
linind = fmod( a_iatom, nAtomsCeiling );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
/* --------------------------------------------------------------------- */
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos3 = jpos.xyz;
qi3 = jpos.w;
jstrindex = isigeps[ iatom ];
isig3 = jstrindex.x;
ieps3 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos4 = jpos.xyz;
qi4 = jpos.w;
jstrindex = isigeps[ iatom ];
isig4 = jstrindex.x;
ieps4 = jstrindex.y;
/* --------------------------------------------------------------------- */
breakflag = 1.0f;
jatom.y = 0.0f;
// which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
which_rep = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
jend = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1.5f ) {
jend = 1e6f;
}
jend += jstart;
//exclind.x = linind;
//exclind.y = jstart * strwidth/4.0f;
//exclind.y = floor( exclind.y + 0.25f );
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
// exclusions = excl[ exclind ];
// exclmask = fmod( exclusions, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
/* --------------------------------------------------------------------- */
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
/* --------------------------------------------------------------------- */
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
/*
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
*/
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
/*
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
*/
sig = isig3 + jsig;
eps = ieps3 * jeps;
qq = qi3 * qj;
d1 = ipos3 - jpos1;
d2 = ipos3 - jpos2;
d3 = ipos3 - jpos3;
d4 = ipos3 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce3 += fs.x * d1;
outforce3 += fs.y * d2;
outforce3 += fs.z * d3;
outforce3 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
/*
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
*/
sig = isig4 + jsig;
eps = ieps4 * jeps;
qq = qi4 * qj;
d1 = ipos4 - jpos1;
d2 = ipos4 - jpos2;
d3 = ipos4 - jpos3;
d4 = ipos4 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce4 += fs.x * d1;
outforce4 += fs.y * d2;
outforce4 += fs.z * d3;
outforce4 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x -= 3.0f;
//exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
//exclind.y = floor( exclind.y + 1.25f );
if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
}
//jatom.y += 1.0f;
jatom.y = floor( jatom.y + 1.2f );
}
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Gather kernel for use with diferent angle, dihedral and improper functions
//
//For small systems, the overhead of calling the kernel is so high
//That we have to minimize the number of kernel calls. At the same
//time we don't want to waste reads, so I'm providing a bunch of kernels
//here that are unrolled to diferent extents. Use the appropriate one
//depending on how many inverse maps there are, rather than looping over
//the simple one.
//helper function to make the unrolling look better
kernel float3 do_gather( float strwidth, float4 invmap<>, float3 forces[][] ) {
float3 f;
float4 quotient, remainder;
float2 idx;
f = float3( 0.0f, 0.0f, 0.0f );
//Convert from linear to 2D index
// quotient = floor( invmap / strwidth );
quotient = round( ( invmap - fmod(invmap, strwidth))/strwidth );
remainder = invmap - quotient * strwidth;
//Add each force only if non-negative
if ( invmap.x >= 0.0f ) {
idx.y = quotient.x;
idx.x = remainder.x;
f = forces[ idx ];
}
if ( invmap.y >= 0.0f ) {
idx.y = quotient.y;
idx.x = remainder.y;
f += forces[ idx ];
}
if ( invmap.z >= 0.0f ) {
idx.y = quotient.z;
idx.x = remainder.z;
f += forces[ idx ];
}
if ( invmap.w >= 0.0f ) {
idx.y = quotient.w;
idx.x = remainder.w;
f += forces[ idx ];
}
return f;
}
//Simple version, takes only one index stream
kernel void kinvmap_gather(
float strwidth, //stream width of the dihedral forces
float4 invmap<>, //indices into the dihedral forces
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap, forces );
}
//Takes two inverse maps
kernel void kinvmap_gather2(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
}
//Takes three inverse maps
kernel void kinvmap_gather3(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
}
//Takes four inverse maps
kernel void kinvmap_gather4(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float4 invmap4<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
outforce += do_gather( strwidth, invmap4, forces );
}
//Takes five inverse maps
kernel void kinvmap_gather5(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float4 invmap4<>,
float4 invmap5<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
outforce += do_gather( strwidth, invmap4, forces );
outforce += do_gather( strwidth, invmap5, forces );
}
//Takes six inverse maps - this is the last one!
kernel void kinvmap_gather6(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float4 invmap4<>,
float4 invmap5<>,
float4 invmap6<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
outforce += do_gather( strwidth, invmap4, forces );
outforce += do_gather( strwidth, invmap5, forces );
outforce += do_gather( strwidth, invmap6, forces );
}
kernel float3 etch_force(
float fpos,
float strwidth,
float3 fi[][],
float3 fj[][],
float3 fk[][],
float3 fl[][]
)
{
float2 ind;
float _fpos;
_fpos = fpos;
if ( _fpos > 300000.0f ) {
_fpos = _fpos - 300000.0f;
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fl[ ind ];
}
else if ( _fpos > 200000.0f ) {
_fpos = _fpos - 200000.0f;
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fk[ ind ];
}
else if ( _fpos > 100000.0f ) {
_fpos = _fpos - 100000.0f;
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fj[ ind ];
}
else if ( _fpos >= -0.5f ) {
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fi[ ind ];
}
else
return 0.0f;
}
//For-loop version doesn't work
//Using a merged version of the above
kernel float2 linear_to_2D( float linind, float width )
{
float2 ind;
//ind.y = floor( linind / width );
ind.y = round( ( linind - fmod( linind, width))/width );
ind.x = linind - ind.y * width;
return ind;
}
//helper function to make the unrolling look better
kernel float3 do_gather_merged_single( float strwidth, float invmap,
float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][] ) {
float3 f;
float2 idx;
float _invmap;
float n;
_invmap = invmap;
n = floor( _invmap / 100000.0f );
_invmap -= n * 100000.0f;
idx = linear_to_2D( _invmap, strwidth );
if ( n > 2.5f ) {
f = fl[ idx ];
}
else if ( n > 1.5f ) {
f = fk[ idx ];
}
else if ( n > 0.5f ) {
f = fj[ idx ];
}
else if( n > -0.5f ) {
f = fi[ idx ];
}
return f;
}
kernel float3 do_gather_merged( float strwidth, float4 invmap,
float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][])
{
float3 f;
f = do_gather_merged_single( strwidth, invmap.x, fi, fj, fk, fl )
+ do_gather_merged_single( strwidth, invmap.y, fi, fj, fk, fl )
+ do_gather_merged_single( strwidth, invmap.z, fi, fj, fk, fl )
+ do_gather_merged_single( strwidth, invmap.w, fi, fj, fk, fl );
return f;
}
kernel void kinvmap_gather_merged5(
float natoms, //number of atoms
float strwidth, //stream width of out-of-order forces
float4 invmap0<>,
float4 invmap1<>,
float4 invmap2<>,
float4 invmap3<>,
float4 invmap4<>,
float3 fi[][], //i-forces
float3 fj[][], //j-forces
float3 fk[][], //k-forces
float3 fl[][], //l-forces
float3 inforce<>,
out float3 outforce<>
)
{
outforce = inforce;
outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap4, fi, fj, fk, fl );
}
kernel void kinvmap_gather_merged9(
float natoms, //number of atoms
float strwidth, //stream width of out-of-order forces
float4 invmap0<>,
float4 invmap1<>,
float4 invmap2<>,
float4 invmap3<>,
float4 invmap4<>,
float4 invmap5<>,
float4 invmap6<>,
float4 invmap7<>,
float4 invmap8<>,
float3 fi[][], //i-forces
float3 fj[][], //j-forces
float3 fk[][], //k-forces
float3 fl[][], //l-forces
float3 inforce<>,
out float3 outforce<>
)
{
outforce = inforce;
outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap4, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap5, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap6, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap7, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap8, fi, fj, fk, fl );
}
void kinvmap_gather (const float strwidth,
::brook::stream invmap,
::brook::stream forces,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather2 (const float strwidth,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream forces,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather3 (const float strwidth,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream invmap3,
::brook::stream forces,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather4 (const float strwidth,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream invmap3,
::brook::stream invmap4,
::brook::stream forces,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather5 (const float strwidth,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream invmap3,
::brook::stream invmap4,
::brook::stream invmap5,
::brook::stream forces,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather6 (const float strwidth,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream invmap3,
::brook::stream invmap4,
::brook::stream invmap5,
::brook::stream invmap6,
::brook::stream forces,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather3_4 (const float strwidth,
::brook::stream invmap3_1,
::brook::stream invmap3_2,
::brook::stream invmap3_3,
::brook::stream forces3,
::brook::stream invmap4_1,
::brook::stream invmap4_2,
::brook::stream invmap4_3,
::brook::stream invmap4_4,
::brook::stream forces4,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather3_5 (const float strwidth,
::brook::stream invmap3_1,
::brook::stream invmap3_2,
::brook::stream invmap3_3,
::brook::stream forces3,
::brook::stream invmap5_1,
::brook::stream invmap5_2,
::brook::stream invmap5_3,
::brook::stream invmap5_4,
::brook::stream invmap5_5,
::brook::stream forces5,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather5_2 (const float strwidth,
::brook::stream invmap5_1,
::brook::stream invmap5_2,
::brook::stream invmap5_3,
::brook::stream invmap5_4,
::brook::stream invmap5_5,
::brook::stream forces5,
::brook::stream invmap2_1,
::brook::stream invmap2_2,
::brook::stream forces2,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather4_2 (const float strwidth,
::brook::stream invmap4_1,
::brook::stream invmap4_2,
::brook::stream invmap4_3,
::brook::stream invmap4_4,
::brook::stream forces4,
::brook::stream invmap2_1,
::brook::stream invmap2_2,
::brook::stream forces2,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather_merged (const float natoms,
const float strwidth,
const float istrwidth,
const float fstrwidth,
::brook::stream nimap,
::brook::stream invmap,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather_merged9 (const float natoms,
const float strwidth,
::brook::stream invmap0,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream invmap3,
::brook::stream invmap4,
::brook::stream invmap5,
::brook::stream invmap6,
::brook::stream invmap7,
::brook::stream invmap8,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl,
::brook::stream inforce,
::brook::stream outforce);
void kinvmap_gather_merged5 (const float natoms,
const float strwidth,
::brook::stream invmap0,
::brook::stream invmap1,
::brook::stream invmap2,
::brook::stream invmap3,
::brook::stream invmap4,
::brook::stream fi,
::brook::stream fj,
::brook::stream fk,
::brook::stream fl,
::brook::stream inforce,
::brook::stream outforce);
/* After forces above, we have the forces for even numbered particles
* in one stream, odd numbered particles in another.
* In each stream, the forces are in several parts depending on how
* many times we replicated the input stream.
*
* This kernel should not be a bottle neck, but if it turns out to
* be so, we can try some work arounds.
*
* To avoid an extra kernel to zero forces, this sets the forces
* rather than adding to it.
* */
kernel static void kmerge_partial_forces(
float repfac,
float atomStrWidth,
float pforceStrWidth,
float natoms,
float3 pforce1[][],
float3 pforce2[][],
out float3 force<> )
{
float linind;
float2 pindex;
float odd;
float i;
float2 adjustcount = (indexof force);
//convert to linear atom index
linind = adjustcount.x + adjustcount.y * atomStrWidth;
//If odd or even, we pick from diferent streams.
odd = linind - floor( linind / 2.0f ) * 2.0f;
//Now linear index is the index into partial_forces
linind = floor( linind / 2.0f );
force = float3( 0.0f, 0.0f, 0.0f );
//If we have predicated conditionals, we should
//keep the conditional inside the loop
for ( i = 0; i < repfac; i+=1.0f ) {
pindex.y = floor( linind / pforceStrWidth );
pindex.x = linind - pindex.y * pforceStrWidth;
//pindex.x = fmod( linind, pforceStrWidth );
if ( odd > 0.5f ) { //is odd
force += pforce2[ pindex ];
}
else {
force += pforce1[ pindex ];
}
linind += natoms/2.0f;
}
}
kernel void kMergeFloat3_4(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float3 pstream1[][],
float3 pstream2[][],
float3 pstream3[][],
float3 pstream4[][],
out float3 outstream<> )
{
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
// given atom index find force indices and streams
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
outstream = float3( 0.0f, 0.0f, 0.0f );
for( i = 0.0f; i < repfac; i += 1.0f ){
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
pindex.x = qIndex - pindex.y*pStreamWidth;
if ( qOff < 0.5f ){
outstream += pstream1[ pindex ];
} else if( qOff < 1.5f ){
outstream += pstream2[ pindex ];
} else if( qOff < 2.5f ){
outstream += pstream3[ pindex ];
} else {
outstream += pstream4[ pindex ];
}
forceIndex += roundNatoms;
}
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/*
* 3rd attempt at writing a force kernel that uses neighborlists.
* Keeping things very simple this time - no unrolling at all.
*
* Starting with simple Coul-LJ
* */
//Constant dielectric, LJ (normal forces)
kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
float4 invr, invrsig2, invrsig6;
float4 f;
invr = rsqrt( r2 ); // 1 or 2 flops?
invrsig2 = invr * sig; //1 flop
invrsig2 = invrsig2 * invrsig2; //1 flop
invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops
f += epsfac*qq*invr; //2 flops
f *= invr*invr; //2 flops
return f; //Total ? flops
}
kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) {
float4 r2;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops
return r2;
}
kernel void knbforce_nl(
float AtomStrWidth, //Width of atom position stream
float epsfac,
iter float2 wpos<>, //pixel position of output (i-atom)
float4 posq[][], //coordinates and charges
float3 inforce<>,
float4 nlist0<>, //nbor indices
float4 sig0<>, //LJ sigmas
float4 eps0<>, //LJ epsilons
//We'll add more nlists here to reduce number of kernel calls
out float3 force<> //output forces
)
{
float2 jind;
float4 ind_tmp1, ind_tmp2;
float4 qj;
float3 jpos1, jpos2, jpos3, jpos4;
float4 r2, fs, qq;
float3 d1, d2, d3, d4;
float3 ipos;
float qi;
//Since neighbors will not be adjacent in memory
//We may want to stagger compute and etch, but
//with fxc, we don't have control over scheduling anyway.
//Will try hand scheduling the ps3 code later
//etch i parameters
ipos = posq[ wpos ].xyz;
qi = posq[ wpos ].w;
//etch j parameters
ind_tmp1 = floor( nlist0 / AtomStrWidth );
ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
force = inforce;
//1st set of 4 neighbors
jind.y = ind_tmp1.x;
jind.x = ind_tmp2.x;
jpos1 = posq[ jind ].xyz;
qj.x = posq[ jind ].w;
jind.y = ind_tmp1.y;
jind.x = ind_tmp2.y;
jpos2 = posq[ jind ].xyz;
qj.y = posq[ jind ].w;
jind.y = ind_tmp1.z;
jind.x = ind_tmp2.z;
jpos3 = posq[ jind ].xyz;
qj.z = posq[ jind ].w;
jind.y = ind_tmp1.w;
jind.x = ind_tmp2.w;
jpos4 = posq[ jind ].xyz;
qj.w = posq[ jind ].w;
d1 = ipos - jpos1;
d2 = ipos - jpos2;
d3 = ipos - jpos3;
d4 = ipos - jpos4;
r2 = nl_get_r2( d1, d2, d3, d4 );
qq = qi * qj;
fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 );
force += fs.x * d1;
force += fs.y * d2;
force += fs.z * d3;
force += fs.w * d4;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Order N^2 neighbor searching.
*
* This only works for force fields that don't have charge groups.
* If you insist on charge groups, you'll have to pass in appropriate masks here.
*
* This is a simplified kernel, for testing the O(N) speeds.
*
* This does a complete N^2 search without considering groups of
* atoms. Most likely this will prove to be inefficient for
* the O(N) kernel. Lets find out.
*
*
* Each component of the curpass textures is an atom index. The w component
* of curpass3 is a count indicating how many j particles we have
* scanned for this particular i atom.
*
* */
kernel void knborsearch(
float first, //Positive means constructing the first 16.
iter float2 wpos<>, //pixel position of output
float AtomStrHeight,
float AtomStrWidth,
float cutoff2, //square of the cutoff
float natoms, //number of atoms
float excl[][], //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
float4 posq[][], //atom positions/charges
float4 prevpass3<>, //Last output texture of previous pass
out float4 curpass0<>, //First output of current pass
out float4 curpass1<>,
out float4 curpass2<>,
out float4 curpass3<> //Last output of current pass, used in next pass
){
/*For this kernel, wpos == iatom*/
float2 iind;
float2 jind;
float3 ipos, jpos, dr;
float r2;
float listptr; //Where in the 16-chunk are we now.
float jlinind;
float breakflag; //positive means keep looping, negative means stop
float4 exclconst;
float2 exclind;
float exclusions;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
iind = wpos;
exclind.x = iind.x + iind.y * AtomStrWidth;
//etch i atom
ipos = posq[ iind ].xyz;
//Loop over j depending on prevpass
jlinind = prevpass3.w + 1;
jind.y = floor( jlinind / AtomStrWidth );
jind.x = fmod( jlinind, AtomStrWidth );
exclind.y = jlinind;
//All outputs should be initialized to
listptr = 0.0f;
breakflag = 1.0f;
//if we already finished, do nothing
if ( first < 0.0f && prevpass3.w < 0.0f )
breakflag = -1.0f;
//set to -1 to indicate no neighbor
//just to save a separate set of init calls
curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
curpass1 = curpass0;
curpass2 = curpass0;
curpass3 = curpass0;
while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
//First see if this pair is excluded
exclusions = excl[ exclind ];
if ( exclusions < 0.5f ) {
jpos = posq[ jind ].xyz;
dr = jpos - ipos;
r2 = dot( dr, dr );
//If it is inside the cutoff
if ( r2 < cutoff2 ) {
//Figure out where to put it
//We are allowed 4 nested conditionals
//We can play with the structuring of these
if ( listptr < 0.5f )
curpass0.x = jlinind;
else if ( listptr < 1.5f )
curpass0.y = jlinind;
else if ( listptr < 2.5f )
curpass0.z = jlinind;
else if ( listptr < 3.5f )
curpass0.w = jlinind;
else if ( listptr < 4.5f )
curpass1.x = jlinind;
else if ( listptr < 5.5f )
curpass1.y = jlinind;
else if ( listptr < 6.5f )
curpass1.z = jlinind;
else if ( listptr < 7.5f )
curpass1.w = jlinind;
else if ( listptr < 8.5f )
curpass2.x = jlinind;
else if ( listptr < 9.5f )
curpass2.y = jlinind;
else if ( listptr < 10.5f )
curpass2.z = jlinind;
else if ( listptr < 11.5f )
curpass2.w = jlinind;
else if ( listptr < 12.5f )
curpass3.x = jlinind;
else if ( listptr < 13.5f )
curpass3.y = jlinind;
else if ( listptr < 14.5f ) {
curpass3.z = jlinind;
}
else if ( listptr < 15.5f ) {
//We're done for this pass
curpass3.w = jlinind;
breakflag = -1.0f;
}
listptr += 1.0f;
}
}
jlinind += 1.0f;
exclind.y += 1.0f;
jind.x += 1.0f;
}
jind.x = 0.0f;
jind.y += 1.0f;
}
}
//Precomputes lennard jones sig and eps
//to save an indirect etch (and a ew flops) in the
//force kernel. The charge product is not done this way
//because charges have to be etched anyway with the
//positions
kernel void knl_precompute_sigeps(
float AtomStrWidth,
iter float2 wpos<>,
float2 sigeps[][], //x=sigma, y=epsilon
float4 nlist0<>,
float4 nlist1<>,
out float4 sig0<>,
out float4 eps0<>,
out float4 sig1<>,
out float4 eps1<>
)
{
float2 jind;
float4 ind_tmp1, ind_tmp2;
float2 isigeps, jsigeps1, jsigeps2, jsigeps3, jsigeps4;
isigeps = sigeps[ wpos ];
ind_tmp1 = floor( nlist0 / AtomStrWidth );
ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
jind.y = ind_tmp1.x;
jind.x = ind_tmp2.x;
jsigeps1 = sigeps[ jind ];
jind.y = ind_tmp1.y;
jind.x = ind_tmp2.y;
jsigeps2 = sigeps[ jind ];
jind.y = ind_tmp1.z;
jind.x = ind_tmp2.z;
jsigeps3 = sigeps[ jind ];
jind.y = ind_tmp1.w;
jind.x = ind_tmp2.w;
jsigeps4 = sigeps[ jind ];
sig0.x = isigeps.x + jsigeps1.x;
sig0.y = isigeps.x + jsigeps2.x;
sig0.z = isigeps.x + jsigeps3.x;
sig0.w = isigeps.x + jsigeps4.x;
eps0.x = isigeps.y * jsigeps1.y;
eps0.y = isigeps.y * jsigeps2.y;
eps0.z = isigeps.y * jsigeps3.y;
eps0.w = isigeps.y * jsigeps4.y;
//2nd nlist set
ind_tmp1 = floor( nlist1 / AtomStrWidth );
ind_tmp2 = nlist1 - ind_tmp1 * AtomStrWidth;
jind.y = ind_tmp1.x;
jind.x = ind_tmp2.x;
jsigeps1 = sigeps[ jind ];
jind.y = ind_tmp1.y;
jind.x = ind_tmp2.y;
jsigeps2 = sigeps[ jind ];
jind.y = ind_tmp1.z;
jind.x = ind_tmp2.z;
jsigeps3 = sigeps[ jind ];
jind.y = ind_tmp1.w;
jind.x = ind_tmp2.w;
jsigeps4 = sigeps[ jind ];
sig1.x = isigeps.x + jsigeps1.x;
sig1.y = isigeps.x + jsigeps2.x;
sig1.z = isigeps.x + jsigeps3.x;
sig1.w = isigeps.x + jsigeps4.x;
eps1.x = isigeps.y * jsigeps1.y;
eps1.y = isigeps.y * jsigeps2.y;
eps1.z = isigeps.y * jsigeps3.y;
eps1.w = isigeps.y * jsigeps4.y;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
//for each dihedral.
kernel void kpdih(
float xstrwidth, //stream width for x
float4 xq[][], //particle coordinates and charges
float4 atoms<>, //ijkl quartets
float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
out float3 fi<>, //output forces for i, j, k, l
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
) {
float3 r_ij, r_kj, r_kl;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float phi, ddphi, mdphi;
float3 u, v, s;
float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
//Convert from linear indices to 2D indices into x
//If this kernel is compute bound, we can do this
//conversion before-hand and feed in the 2D coordinates
ai.y = floor( atoms.x / xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
aj.y = floor( atoms.y / xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
ak.y = floor( atoms.z / xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
al.y = floor( atoms.w / xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
r_ij = xq[ai].xyz - xq[aj].xyz; //3
r_kj = xq[ak].xyz - xq[aj].xyz; //3
r_kl = xq[ak].xyz - xq[al].xyz; //3
m = cross( r_ij, r_kj ); //9
n = cross( r_kj, r_kl ); //9
msq = dot(m, m); //5
nsq = dot(n, n); //5
cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0, 1.0 ); //8
sgnphi = sign( dot( r_ij, n ) ); //5
phi = sgnphi * acos( cos_phi ); //2
mdphi = parms.z * phi - parms.y; //2
ddphi = - parms.x * parms.z * sin( mdphi ); //3
nrkj2 = dot( r_kj, r_kj ); //5
nrkj = sqrt( nrkj2 ); //1
fi = -ddphi * nrkj / msq * m; //5
fl = ddphi * nrkj / nsq * n; //5
u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
s = u - v; //3
fj = s - fi; //3
fk = -(s + fl); //3
//Total : 100 flops
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
//for each dihedral.
kernel void krbdih(
float xstrwidth, //stream width for x
float4 xq[][], //particle coordinates and charges
float4 atoms<>, //ijkl quartets
float4 parm03<>, //params 0-3
float2 parm45<>, //params 4 and 5
out float3 fi<>, //output forces for i, j, k, l
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
) {
float3 r_ij, r_kj, r_kl;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float ddphi;
float3 u, v, s;
float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
//Convert from linear indices to 2D indices into x
//If this kernel is compute bound, we can do this
//conversion before-hand and feed in the 2D coordinates
ai.y = floor( atoms.x / xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
aj.y = floor( atoms.y / xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
ak.y = floor( atoms.z / xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
al.y = floor( atoms.w / xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
r_ij = xq[ai].xyz - xq[aj].xyz; //3
r_kj = xq[ak].xyz - xq[aj].xyz; //3
r_kl = xq[ak].xyz - xq[al].xyz; //3
m = cross( r_ij, r_kj ); //9
n = cross( r_kj, r_kl ); //9
msq = dot(m, m); //5
nsq = dot(n, n); //5
cos_phi = dot(m, n)/sqrt(msq*nsq); //8 (sqrt=1)
//Switching to "polymer convention"
//See gromacs code
cos_phi = -cos_phi;
sgnphi = sign( dot(r_ij, n) ); //5
sin_phi = -sgnphi*sqrt( clamp( 1.0 - cos_phi * cos_phi, 0.0, 1.0) ); //3
//ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1}
//This might not be the best way to use the
//4-way mads, but for now we'll let fxc figure it
//out.
//If we precompute some ratios of the parameters
//we can use the 4-way mads better
ddphi = 5.0 * parm45.y;
ddphi = 4.0 * parm45.x + ddphi * cos_phi;
ddphi = 3.0 * parm03.w + ddphi * cos_phi;
ddphi = 2.0 * parm03.z + ddphi * cos_phi;
ddphi = parm03.y + ddphi * cos_phi;
ddphi = -ddphi * sin_phi; //13 flops total for ddphi
nrkj2 = dot( r_kj, r_kj ); //5
nrkj = sqrt( nrkj2 ); //1
fi = -ddphi * nrkj / msq * m; //5
fl = ddphi * nrkj / nsq * n; //5
u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
s = u - v; //3
fj = s - fi; //3
fk = -(s + fl); //3
//Total flops: 109 per rb torsion.
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/*
* This kernel constrains bonds involving hydrogen using an unrolled
* shake implementation. We assume that a heavy atom can have at most
* 3 hydrogens attached. The hydrogens are expected to be identical
*
* For atoms that have ewer than 3 hydrogens
*
* Also we don't check the constraints!
*
* We do a fixed number of iterations without checking for convergence
* to avoid stalling
* *************************************************************/
/*
kernel void
kshakeh(
float nit, //number of iterations
float strwidth, //stream width of posq
float invmH, //inverse mass of hydrogen
float omega, //parameter from gromacs, possibly unused
float4 atoms<>, //heavy0, h1, h2, h3
float4 posq[][], //positions before update
float4 posqp[][], //positions after update
float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
out float4 cposq0<>, //constrained position for heavy atom
out float4 cposq1<>, //ditto for h1
out float4 cposq2<>, //ditto for h2
out float4 cposq3<> //ditto for h3
) {
float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
float i; //iteration count
float3 xi, xj1, xj2, xj3; //coordinates
float3 xpi, xpj1, xpj2, xpj3; //coordinates
float3 rij1, rij2, rij3, rpij, dr;
float rpsqij, rrpr, acor;
float mask2, mask3;
float diff;
ai.y = floor( atoms.x / strwidth );
ai.x = atoms.x - ai.y * strwidth;
aj1.y = floor( atoms.y / strwidth );
aj1.x = atoms.y - aj1.y * strwidth;
//If further hydrogens are absent,
//just set to the coordinates of the first
//so we don't have any junk memory accesses
//or nans/infs in the calcs.
if ( atoms.z > -0.5f ) {
aj2.y = floor( atoms.z / strwidth );
aj2.x = atoms.z - aj2.y * strwidth;
mask2 = 1.0f;
}
else {
aj2 = aj1;
mask2 = 0.0f;
}
if ( atoms.w > -0.5f ) {
aj3.y = floor( atoms.w / strwidth );
aj3.x = atoms.w - aj3.y * strwidth;
mask3 = 1.0f;
}
else {
aj3 = aj1;
mask3 = 0.0f;
}
cposq0 = posq[ai];
cposq1 = posq[aj1];
cposq2 = posq[aj2];
cposq3 = posq[aj3];
xi = cposq0.xyz;
xj1 = cposq1.xyz;
xj2 = cposq2.xyz;
xj3 = cposq3.xyz;
rij1 = xi - xj1;
rij2 = xi - xj2;
rij3 = xi - xj3;
xpi = posqp[ai].xyz;
xpj1 = posqp[aj1].xyz;
xpj2 = posqp[aj2].xyz;
xpj3 = posqp[aj3].xyz;
i = 0.0f;
while ( i < 15.0f ) {
//First hydrogen
rpij = xpi - xpj1;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij1, rpij );
//for debugging only
diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
if ( diff < 1.0f )
acor = 0.0f;
else
//params.y = mu/2, params.z = blen*blen
acor = omega * ( params.z - rpsqij ) * params.y / rrpr ;
dr = rij1 * acor;
xpi += dr * params.x;
xpj1 -= dr * invmH;
//Second hydrogen
rpij = xpi - xpj2;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij2, rpij );
//for debugging only
diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
if ( diff < 1.0f )
acor = 0.0f;
else
//params.y = mu/2, params.z = blen*blen
acor = mask2 * omega * ( params.z - rpsqij ) * params.y / rrpr ;
dr = rij2 * acor;
xpi += dr * params.x;
xpj2 -= dr * invmH;
//Third hydrogen
rpij = xpi - xpj3;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij3, rpij );
//for debugging only
diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
if ( diff < 1.0f )
acor = 0.0f;
else
//params.y = mu/2, params.z = blen*blen
acor = mask3 * omega * ( params.z - rpsqij ) * params.y / rrpr ;
dr = rij3 * acor;
xpi += dr * params.x;
xpj3 -= dr * invmH;
i += 1.0f;
}
cposq0.xyz = xpi;
cposq1.xyz = xpj1;
cposq2.xyz = xpj2;
cposq3.xyz = xpj3;
} */
/*Applies the updated positions
*Each atom occurs at one and only one position
* */
/*
kernel void kshakeh_update(
float strwidth, //width of cposq streams
float2 invmap<>, //shakeh inverse map
float4 posq<>, //untouched positions
float4 cposq0[][], //constrained position for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated positions
){
float2 atom;
atom.y = floor( invmap.x / strwidth );
atom.x = invmap.x - atom.y * strwidth;
if ( invmap.y < 0 )
oposq = posq;
else if ( invmap.y < 0.5f )
oposq = cposq0[ atom ];
else if ( invmap.y < 1.5f )
oposq = cposq1[ atom ];
else if ( invmap.y < 2.5f )
oposq = cposq2[ atom ];
else if ( invmap.y < 3.5f )
oposq = cposq3[ atom ];
} */
/* Fix for precision of terms first order in dt
* To be used with corresponding update kernels
* The input posqp is now changes to posq rather than
* posq+changes
* */
kernel void
kshakeh_fix1(
float nit, //number of iterations
float strwidth, //stream width of posq
float invmH, //inverse mass of hydrogen
float omega, //parameter from gromacs, possibly unused
float4 atoms<>, //heavy0, h1, h2, h3
float4 posq[][], //positions before update
float4 posqp[][], //changes to positions
float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
out float4 cposq0<>, //constrained position for heavy atom
out float4 cposq1<>, //ditto for h1
out float4 cposq2<>, //ditto for h2
out float4 cposq3<> //ditto for h3
) {
float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
float i; //iteration count
float3 xi, xj1, xj2, xj3; //coordinates
float3 xpi, xpj1, xpj2, xpj3; //coordinates
float3 rij1, rij2, rij3, rpij, dr;
float rpsqij, rrpr, acor;
float mask2, mask3;
float diff;
float rij1sq, rij2sq, rij3sq;
float ld1, ld2, ld3;
// ai.y = floor( atoms.x / strwidth );
ai.y = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
ai.x = atoms.x - ai.y * strwidth;
// aj1.y = floor( atoms.y / strwidth );
aj1.y = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
aj1.x = atoms.y - aj1.y * strwidth;
//If further hydrogens are absent,
//just set to the coordinates of the first
//so we don't have any junk memory accesses
//or nans/infs in the calcs.
if ( atoms.z > -0.5f ) {
// aj2.y = floor( atoms.z / strwidth );
aj2.y = round( (atoms.z - fmod( atoms.z, strwidth ))/strwidth );
aj2.x = atoms.z - aj2.y * strwidth;
mask2 = 1.0f;
}
else {
aj2 = aj1;
mask2 = 0.0f;
}
if ( atoms.w > -0.5f ) {
// aj3.y = floor( atoms.w / strwidth );
aj3.y = round( (atoms.w - fmod( atoms.w, strwidth ))/strwidth );
aj3.x = atoms.w - aj3.y * strwidth;
mask3 = 1.0f;
}
else {
aj3 = aj1;
mask3 = 0.0f;
}
cposq0 = posq[ai];
cposq1 = posq[aj1];
cposq2 = posq[aj2];
cposq3 = posq[aj3];
xi = cposq0.xyz;
xj1 = cposq1.xyz;
xj2 = cposq2.xyz;
xj3 = cposq3.xyz;
rij1 = xi - xj1;
rij2 = xi - xj2;
rij3 = xi - xj3;
rij1sq = dot( rij1, rij1 );
rij2sq = dot( rij2, rij2 );
rij3sq = dot( rij3, rij3 );
ld1 = params.z - rij1sq;
ld2 = params.z - rij2sq;
ld3 = params.z - rij3sq;
/*
xpi = posqp[ai].xyz;
xpj1 = posqp[aj1].xyz;
xpj2 = posqp[aj2].xyz;
xpj3 = posqp[aj3].xyz;
*/
xpi = posqp[ai].xyz - xi;
xpj1 = posqp[aj1].xyz - xj1;
xpj2 = posqp[aj2].xyz - xj2;
xpj3 = posqp[aj3].xyz - xj3;
// XXX
// cposq0 = posqp[ai];
// cposq1 = posqp[aj1];
// cposq2 = posqp[aj2];
// cposq3 = posqp[aj3];
// OK
// XXX
i = 0.0f;
while ( i < 15.0f ) {
//First hydrogen
rpij = xpi - xpj1; //This is really rpij - rij
rpsqij = dot( rpij, rpij ); //This is really deltar ^ 2
rrpr = dot( rij1, rpij ); //This is r.deltar
//for debugging only
//params.y = mu/2, params.z = blen*blen
diff = abs( ld1 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = omega * ( ld1 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij1sq ) ;
dr = rij1 * acor;
xpi += dr * params.x;
xpj1 -= dr * invmH;
//Second hydrogen
rpij = xpi - xpj2;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij2, rpij );
//for debugging only
diff = abs( ld2 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask2 * omega * ( ld2 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij2sq ) ;
dr = rij2 * acor;
xpi += dr * params.x;
xpj2 -= dr * invmH;
//Third hydrogen
rpij = xpi - xpj3;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij3, rpij );
diff = abs( ld3 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
// for debugging only
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask3 * omega * ( ld3 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij3sq ) ;
dr = rij3 * acor;
xpi += dr * params.x;
xpj3 -= dr * invmH;
i += 1.0f;
}
//Output modified delta's
cposq0.xyz = xpi;
cposq1.xyz = xpj1;
cposq2.xyz = xpj2;
cposq3.xyz = xpj3;
}
kernel void
kshakeh_fix2(
float nit, //number of iterations
float strwidth, //stream width of posq
float invmH, //inverse mass of hydrogen
float omega, //parameter from gromacs, possibly unused
float4 atoms<>, //heavy0, h1, h2, h3
float4 posq[][], //positions before update
float4 posqp[][], //changes to positions
float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
out float4 cposq0<>, //constrained position for heavy atom
out float4 cposq1<>, //ditto for h1
out float4 cposq2<>, //ditto for h2
out float4 cposq3<> //ditto for h3
) {
float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
float i; //iteration count
float3 xi, xj1, xj2, xj3; //coordinates
float3 xpi, xpj1, xpj2, xpj3; //coordinates
float3 rij1, rij2, rij3, rpij, dr;
float rpsqij, rrpr, acor;
float mask2, mask3;
float diff;
float rij1sq, rij2sq, rij3sq;
float ld1, ld2, ld3;
// ai.y = floor( atoms.x / strwidth );
ai.y = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
ai.x = atoms.x - ai.y * strwidth;
// aj1.y = floor( atoms.y / strwidth );
aj1.y = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
aj1.x = atoms.y - aj1.y * strwidth;
//If further hydrogens are absent,
//just set to the coordinates of the first
//so we don't have any junk memory accesses
//or nans/infs in the calcs.
if ( atoms.z > -0.5f ) {
// aj2.y = floor( atoms.z / strwidth );
aj2.y = round( (atoms.z - fmod( atoms.z, strwidth ))/strwidth );
aj2.x = atoms.z - aj2.y * strwidth;
mask2 = 1.0f;
}
else {
aj2 = aj1;
mask2 = 0.0f;
}
if ( atoms.w > -0.5f ) {
// aj3.y = floor( atoms.w / strwidth );
aj3.y = round( (atoms.w - fmod( atoms.w, strwidth ))/strwidth );
aj3.x = atoms.w - aj3.y * strwidth;
mask3 = 1.0f;
}
else {
aj3 = aj1;
mask3 = 0.0f;
}
cposq0 = posq[ai];
cposq1 = posq[aj1];
cposq2 = posq[aj2];
cposq3 = posq[aj3];
xi = cposq0.xyz;
xj1 = cposq1.xyz;
xj2 = cposq2.xyz;
xj3 = cposq3.xyz;
rij1 = xi - xj1;
rij2 = xi - xj2;
rij3 = xi - xj3;
rij1sq = dot( rij1, rij1 );
rij2sq = dot( rij2, rij2 );
rij3sq = dot( rij3, rij3 );
ld1 = params.z - rij1sq;
ld2 = params.z - rij2sq;
ld3 = params.z - rij3sq;
xpi = posqp[ai].xyz;
xpj1 = posqp[aj1].xyz;
xpj2 = posqp[aj2].xyz;
xpj3 = posqp[aj3].xyz;
/*
xpi = posqp[ai].xyz - xi;
xpj1 = posqp[aj1].xyz - xj1;
xpj2 = posqp[aj2].xyz - xj2;
xpj3 = posqp[aj3].xyz - xj3;
*/
// XXX
// cposq0 = posqp[ai];
// cposq1 = posqp[aj1];
// cposq2 = posqp[aj2];
// cposq3 = posqp[aj3];
// OK
// XXX
i = 0.0f;
while ( i < 15.0f ) {
//First hydrogen
rpij = xpi - xpj1; //This is really rpij - rij
rpsqij = dot( rpij, rpij ); //This is really deltar ^ 2
rrpr = dot( rij1, rpij ); //This is r.deltar
//for debugging only
//params.y = mu/2, params.z = blen*blen
diff = abs( ld1 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = omega * ( ld1 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij1sq ) ;
dr = rij1 * acor;
xpi += dr * params.x;
xpj1 -= dr * invmH;
//Second hydrogen
rpij = xpi - xpj2;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij2, rpij );
//for debugging only
diff = abs( ld2 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask2 * omega * ( ld2 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij2sq ) ;
dr = rij2 * acor;
xpi += dr * params.x;
xpj2 -= dr * invmH;
//Third hydrogen
rpij = xpi - xpj3;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij3, rpij );
diff = abs( ld3 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
// for debugging only
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask3 * omega * ( ld3 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij3sq ) ;
dr = rij3 * acor;
xpi += dr * params.x;
xpj3 -= dr * invmH;
i += 1.0f;
}
//Output modified delta's
cposq0.xyz = xpi;
cposq1.xyz = xpj1;
cposq2.xyz = xpj2;
cposq3.xyz = xpj3;
}
/*Applies the updated delta's
*Each atom occurs at one and only one position
* */
kernel void kshakeh_update1_fix1Old(
float strwidth, //width of cposq streams
float2 invmap<>, //shakeh inverse map
float4 posq<>, //constrained deltas
float4 cposq0[][], //constrained delta for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated deltas
){
float2 atom;
//atom.y = floor( invmap.x / strwidth );
atom.y = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
atom.x = invmap.x - atom.y * strwidth;
if ( invmap.y < 0 )
oposq = posq;
else if ( invmap.y < 0.5f )
oposq = cposq0[ atom ];
else if ( invmap.y < 1.5f )
oposq = cposq1[ atom ];
else if ( invmap.y < 2.5f )
oposq = cposq2[ atom ];
else if ( invmap.y < 3.5f )
oposq = cposq3[ atom ];
}
kernel void kshakeh_update2_fix1(
float strwidth, //width of cposq streams
float2 invmap<>, //shakeh inverse map
float4 posq<>, //old positions
float4 posqp<>, //deltas from sd2
float4 cposq0[][], //constrained delta for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated deltas
){
float2 atom;
// atom.y = floor( invmap.x / strwidth );
atom.y = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
atom.x = invmap.x - atom.y * strwidth;
if ( invmap.y < 0 ){
oposq.w = posq.w;
oposq.xyz = posqp.xyz - posq.xyz;
} else if ( invmap.y < 0.5f )
oposq = cposq0[ atom ];
else if ( invmap.y < 1.5f)
oposq = cposq1[ atom ];
else if ( invmap.y < 2.5f )
oposq = cposq2[ atom ];
else if ( invmap.y < 3.5f )
oposq = cposq3[ atom ];
oposq.xyz += posq.xyz;
}
kernel void kshakeh_update1_fix1(
float strwidth, //width of cposq streams
float sdFactor,
float2 invmap<>, //shakeh inverse map
float4 posq<>, //old positions
float4 posqp<>, //deltas from sd2
float3 vp<>, //deltas from sd2
float4 cposq0[][], //constrained delta for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated deltas
){
float2 atom;
// atom.y = floor( invmap.x / strwidth );
atom.y = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
atom.x = invmap.x - atom.y * strwidth;
oposq = posq;
if ( invmap.y < 0 ){
oposq = posqp;
} else if ( invmap.y < 0.5f )
oposq.xyz += cposq0[ atom ].xyz;
else if ( invmap.y < 1.5f )
oposq.xyz += cposq1[ atom ].xyz;
else if ( invmap.y < 2.5f )
oposq.xyz += cposq2[ atom ].xyz;
else if ( invmap.y < 3.5f )
oposq.xyz += cposq3[ atom ].xyz;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void kupdate_md1_berendsen(
float dt,
float3 lg, //Berendsen coupling, assuming only one group
float3 uold, //Mean velocity from previous step
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vb;
float3 one;
posqp = posq;
one = float3( 1.0f, 1.0f, 1.0f );
vb = ( one - lg ) * uold;
vnew = lg* ( v + f * invmass * dt ) + vb;
posqp.xyz += vnew * dt;
}
//Nose-Hoover / Parinello Rahman
kernel void kupdate_md1_extended (
float dt,
float3 lg,
float xi,
float3 M0,
float3 M1,
float3 M2,
float3 uold,
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vrel, vnrel;
float3 vtrans;
vrel = v - uold;
vtrans = float3( dot(M0, vrel), dot(M1, vrel), dot(M2, vrel) );
vrel += dt * ( invmass * f - xi * vrel - vtrans );
vnew = uold + lg * vrel;
posqp = posq;
posqp.xyz += vnew * dt;
}
kernel void kupdate_md2(
float dtinv, //1/dt
float4 posqp<>, //positions after constraints
float4 posq<>, //positions before update
out float3 vnew<>, //Corrected velocities
out float4 posqnew<> //equal to posqp, avoids an extra call to copy
)
{
posqnew = posqp;
vnew = ( posqp - posq ) * dtinv;
}
/* Version that handles terms of order dt more carefully
* Update1 computes deltax's rather than x + deltax
* Corresponding shake implementation modifies the delta's
* Update2 adds the constrained deltas to x.
* */
kernel void kupdate_md1_berendsen_fix1(
float dt,
float3 lg, //Berendsen coupling, assuming only one group
float3 uold, //Mean velocity from previous step
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vb;
float3 one;
one = float3( 1.0f, 1.0f, 1.0f );
vb = ( one - lg ) * uold;
vnew = lg* ( v + f * invmass * dt ) + vb;
posqp.xyz = vnew * dt;
posqp.w = posq.w;
}
//Nose-Hoover / Parinello Rahman
kernel void kupdate_md1_extended_fix1 (
float dt,
float3 lg,
float xi,
float3 M0,
float3 M1,
float3 M2,
float3 uold,
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vrel, vnrel;
float3 vtrans;
vrel = v - uold;
vtrans = float3( dot(M0, vrel), dot(M1, vrel), dot(M2, vrel) );
vrel += dt * ( invmass * f - xi * vrel - vtrans );
vnew = uold + lg * vrel;
posqp.w = posq.w;
posqp.xyz = vnew * dt;
}
kernel void kupdate_md2_fix1(
float dtinv, //1/dt
float4 posqp<>, //positions after constraints
float4 posq<>, //positions before update
out float3 vnew<>, //Corrected velocities
out float4 posqnew<> //equal to posqp, avoids an extra call to copy
)
{
posqnew.xyz = posq.xyz + posqp.xyz;
posqnew.w = posq.w;
vnew = posqp.xyz * dtinv;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Stochastic Integrator
*
* Since we don't have random numbers on the GPU, we
* precalculate random numbers from a normal distribution
* and store them in several large textures. Periodically
* these will have to be refreshed on the CPU.
*
* Doing gathers instead of streams for the random vectors,
* but the gathers should be fully coherent anyway.
* This is just to avoid some
* painful manipulations with multiple streams and domains.
* */
/* First part before first constraint call
*
*
* Many things can be precalculated here
* If we become compute bound (wonder how), we can precalc stuff
*
* Assuming constant temperature. If you want to change the
* temperature, you have to change all the precalculated
* constants that use it.
* */
/*
kernel void kupdate_sd1(
float xstrwidth, //atom stream width
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
//sd constants, prefixed with c
//to avoid 1 letter names, cxx = sdc[0].xx
float cem,
//sd precalculated constants
float pc1, //tau_t[0] * ( 1 - sdc[0].em )
float pc2, //tau_t[gt] * (sdc[gt].eph - sdc.emh)
float pc3, //sdc[0].d/(tau_t * sdc[0].c);
//sdpc = sd precalculated per atom values
//sdpc.x = 1/sqrt(m)*sig[0].Yv
//sdpc.y = 1/sqrt(m)*sig[0].V
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd2X<>, //This is calculated in sd2 of the previous step
float4 posq<>,
float3 f<>,
float3 v<>,
float invmass<>,
out float3 sd1V<>, //This is V in gromacs, reused in sd2
out float3 vnew<>,
out float4 posqp<>
){
float3 Vmh;
float3 fg1, fg2;
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
//2 random vectors used in each fragment
linind_gauss = 2 * linind_gauss + goffset;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
Vmh = sd2X * pc3 + sdpc.x * fg1;
sd1V = sdpc.y * fg2;
vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
posqp.w = posq.w;
posqp.xyz = posq.xyz + vnew * pc2;
} */
/*Second part of sd update, after first constraints call*/
/*
kernel void kupdate_sd2(
float xstrwidth, //stream width of atoms
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
float pc1, //this is 1/pc2 for sd1
//= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
float pc2, //= tau_t * sdc[0].d/(sdc[0].em - 1)
//per atom precalcs
//sdpc.x = 1/sqrt(m)*sig[0].Yx
//sdpc.y = 1/sqrt(m)*sig[0].X
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd1V<>, //calculated in sd1
float4 posq<>, //positions pre-update
float4 posqp<>, //positions post-constraint
float3 vnew<>, //velocities after sd1
out float3 sd2X<>, //Used in sd1
out float3 v<>, //velocities corrected for constraints
out float4 posqp2<> //Will need to be constrained again
){
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float3 fg1, fg2;
float3 Xmh;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
linind_gauss = 2 * linind_gauss + goffset;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
v = pc1 * ( posqp.xyz - posq.xyz );
Xmh = sd1V * pc2 + sdpc.x * fg1;
sd2X = sdpc.y * fg2;
posqp2 = posqp;
posqp2.xyz += sd2X - Xmh;
} */
//Applies a permutation to the given random vector stream
//Totally bandwidth bound and streams are large, should be done infrequently.
//Probably don't want gvin and gvout to be the same stream!
kernel void kpermute_vectors( float gstrwidth, float perm<>,
float3 gvin[][], out float3 gvout<> )
{
float2 ind;
//ind.y = floor( perm / gstrwidth );
ind.y = round( (perm - fmod( perm, gstrwidth ))/gstrwidth );
ind.x = perm - ind.y * gstrwidth;
gvout = gvin[ ind ];
}
/*
* Alternative formulation to handle precision better
* In updatesd1, we output only deltaV's. The shake
* kernel uses the deltaV's and the X's to output
* a bunch of constrained deltax's
* */
kernel void kupdate_sd1_fix1(
float xstrwidth, //atom stream width
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
//sd constants, prefixed with c
//to avoid 1 letter names, cxx = sdc[0].xx
float cem,
//sd precalculated constants
float pc1, //tau_t[0] * ( 1 - sdc[0].em )
float pc2, //tau_t[gt] * (sdc[gt].eph - sdc.emh)
float pc3, //sdc[0].d/(tau_t * sdc[0].c);
//sdpc = sd precalculated per atom values
//sdpc.x = 1/sqrt(m)*sig[0].Yv
//sdpc.y = 1/sqrt(m)*sig[0].V
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd2X<>, //This is calculated in sd2 of the previous step
float4 posq<>,
float3 f<>,
float3 v<>,
float invmass<>,
out float3 sd1V<>, //This is V in gromacs, reused in sd2
out float3 vnew<>,
out float4 posqp<>
){
float3 Vmh;
float3 fg1, fg2;
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
//2 random vectors used in each fragment
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
/*
Gromacs code
n = 1 -> ngtc
sdc[n].gdt = dt/tau_t[n];
sdc[n].eph = exp(sdc[n].gdt/2);
sdc[n].emh = exp(-sdc[n].gdt/2);
sdc[n].em = exp(-sdc[n].gdt);
sdc[n].b = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);
// 2.15 in paper (C)
sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
Vmh = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);
V[n-start][d] = ism*sig[gt].V*fgauss(&jran);
v[n][d] = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
V[n-start][d] - sdc[gt].em*Vmh;
xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
*/
// ---------------------------------------------------------------------------------------------
// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
//sdpc.x = 1/sqrt(m)*sig[0].Yv
Vmh = sd2X * pc3 + sdpc.x * fg1;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//sdpc.y = 1/sqrt(m)*sig[0].V
sd1V = sdpc.y * fg2;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//
// float pc1 = tau_t[0] * ( 1 - sdc[0].em )
vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
// float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
posqp = posq;
posqp.xyz += vnew * pc2;
}
kernel void kupdate_sd2_fix1(
float xstrwidth, //stream width of atoms
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
float pc1, //this is 1/pc2 for sd1
//= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
float pc2, //= tau_t * sdc[0].d/(sdc[0].em - 1)
//per atom precalcs
//sdpc.x = 1/sqrt(m)*sig[0].Yx
//sdpc.y = 1/sqrt(m)*sig[0].X
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd1V<>, //calculated in sd1
float4 posq<>, //positions pre-update
float4 posqp<>, //deltas post constraint
float3 vnew<>, //velocities after sd1
out float3 sd2X<>, //Used in sd1
out float3 v<>, //velocities corrected for constraints
out float4 posqp2<> //new deltas, need to be constrained again
){
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float3 fg1, fg2;
float3 Xmh;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
//v = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
v = pc1 * (posqp.xyz - posq.xyz);
Xmh = sd1V * pc2 + sdpc.x * fg1;
sd2X = sdpc.y * fg2;
posqp2 = posqp;
posqp2.xyz += sd2X - Xmh;
}
kernel void kupdate_sd1_fix1_FixedRV(
float xstrwidth, //atom stream width
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
//sd constants, prefixed with c
//to avoid 1 letter names, cxx = sdc[0].xx
float cem,
//sd precalculated constants
float pc1, //tau_t[0] * ( 1 - sdc[0].em )
float pc2, //tau_t[gt] * (sdc[gt].eph - sdc.emh)
float pc3, //sdc[0].d/(tau_t * sdc[0].c);
//sdpc = sd precalculated per atom values
//sdpc.x = 1/sqrt(m)*sig[0].Yv
//sdpc.y = 1/sqrt(m)*sig[0].V
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd2X<>, //This is calculated in sd2 of the previous step
float4 posq<>,
float3 f<>,
float3 v<>,
float invmass<>,
out float3 sd1V<>, //This is V in gromacs, reused in sd2
out float3 vnew<>,
out float4 posqp<>
){
float3 Vmh;
float3 fg1, fg2;
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float RandomValueVsp = 0.1f;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
//2 random vectors used in each fragment
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
//fg1 = fgauss[ igauss ];
fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
//fg2 = fgauss[ igauss ];
fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
/*
Gromacs code
n = 1 -> ngtc
sdc[n].gdt = dt/tau_t[n];
sdc[n].eph = exp(sdc[n].gdt/2);
sdc[n].emh = exp(-sdc[n].gdt/2);
sdc[n].em = exp(-sdc[n].gdt);
sdc[n].b = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);
// 2.15 in paper (C)
sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
Vmh = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);
V[n-start][d] = ism*sig[gt].V*fgauss(&jran);
v[n][d] = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
V[n-start][d] - sdc[gt].em*Vmh;
xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
*/
// ---------------------------------------------------------------------------------------------
// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
//sdpc.x = 1/sqrt(m)*sig[0].Yv
Vmh = sd2X * pc3 + sdpc.x * fg1;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//sdpc.y = 1/sqrt(m)*sig[0].V
sd1V = sdpc.y * fg2;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//
// float pc1 = tau_t[0] * ( 1 - sdc[0].em )
vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
// float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
posqp = posq;
posqp.xyz += vnew * pc2;
}
kernel void kupdate_sd2_fix1_FixedRV(
float xstrwidth, //stream width of atoms
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
float pc1, //this is 1/pc2 for sd1
//= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
float pc2, //= tau_t * sdc[0].d/(sdc[0].em - 1)
//per atom precalcs
//sdpc.x = 1/sqrt(m)*sig[0].Yx
//sdpc.y = 1/sqrt(m)*sig[0].X
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd1V<>, //calculated in sd1
float4 posq<>, //positions pre-update
float4 posqp<>, //deltas post constraint
float3 vnew<>, //velocities after sd1
out float3 sd2X<>, //Used in sd1
out float3 v<>, //velocities corrected for constraints
out float4 posqp2<> //new deltas, need to be constrained again
){
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float3 fg1, fg2;
float3 Xmh;
float RandomValueVsp = 0.1f;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
//fg1 = fgauss[ igauss ];
fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
// fg2 = fgauss[ igauss ];
fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
//v = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
v = pc1 * (posqp.xyz - posq.xyz);
Xmh = sd1V * pc2 + sdpc.x * fg1;
sd2X = sdpc.y * fg2;
posqp2 = posqp;
posqp2.xyz += sd2X - Xmh;
}
/* pre-tests, known to be right */
kernel void pre_test0( float input<>, out float output<> )
{
output = input;
}
kernel void pre_test1( float input<>, out float output<> )
{
output = ( input * input ) + input;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment