Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b53cd947
Commit
b53cd947
authored
Aug 15, 2008
by
Mark Friedrichs
Browse files
Latest mods
parent
ff88ddad
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
4528 additions
and
0 deletions
+4528
-0
platforms/brook/src/gpu/kcommonOrig.br
platforms/brook/src/gpu/kcommonOrig.br
+34
-0
platforms/brook/src/gpu/kforce.h
platforms/brook/src/gpu/kforce.h
+366
-0
platforms/brook/src/gpu/kforce_CDLJ.br
platforms/brook/src/gpu/kforce_CDLJ.br
+1740
-0
platforms/brook/src/gpu/kinvmap_gather.br
platforms/brook/src/gpu/kinvmap_gather.br
+336
-0
platforms/brook/src/gpu/kinvmap_gather.h
platforms/brook/src/gpu/kinvmap_gather.h
+151
-0
platforms/brook/src/gpu/kmerge_partial_forces.br
platforms/brook/src/gpu/kmerge_partial_forces.br
+101
-0
platforms/brook/src/gpu/knlforce.br
platforms/brook/src/gpu/knlforce.br
+114
-0
platforms/brook/src/gpu/knlist.br
platforms/brook/src/gpu/knlist.br
+237
-0
platforms/brook/src/gpu/kpdihs.br
platforms/brook/src/gpu/kpdihs.br
+87
-0
platforms/brook/src/gpu/krbdihs.br
platforms/brook/src/gpu/krbdihs.br
+100
-0
platforms/brook/src/gpu/kshakeh.br
platforms/brook/src/gpu/kshakeh.br
+609
-0
platforms/brook/src/gpu/kupdatemd.br
platforms/brook/src/gpu/kupdatemd.br
+141
-0
platforms/brook/src/gpu/kupdatesd.br
platforms/brook/src/gpu/kupdatesd.br
+501
-0
platforms/brook/src/gpu/pre_test.br
platforms/brook/src/gpu/pre_test.br
+11
-0
No files found.
platforms/brook/src/gpu/kcommonOrig.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Kernel to set the xyz components of a float4 stream
//Used for changing the coordinates without changing
//the charges in strPosQ
kernel void ksetxyz( float3 instr<>, float4 before<>, out float4 after<> ) {
after.xyz = instr;
after.w = before.w;
}
//Inverse of above
kernel void kgetxyz( float4 instr<>, out float3 outstr<> ) {
outstr = instr.xyz;
}
//Zeroes out a stream
kernel void kzerof3( out float3 outstr<> ) {
outstr = float3( 0.0, 0.0, 0.0 );
}
//Zeros out a stream
kernel void kzerof4( out float4 outstr<> ) {
outstr = float4( 0.0, 0.0, 0.0, 0.0 );
}
kernel void ksetf4( float4 val, out float4 outstr<> ) {
outstr = val;
}
platforms/brook/src/gpu/kforce.h
0 → 100644
View file @
b53cd947
#ifndef __KFORCE_H__
#define __KFORCE_H__
//Define a generic force kernel prototype
//To make switching easier to look at
//This should match the kernels from kforce.br
typedef
void
(
*
gpuNBForceFunction
)(
const
float
natoms
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
);
void
knbforce_CDLJ
(
const
float
natoms
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
);
void
knbforce_CDLJ2
(
const
float
natoms
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
);
void
knbforce_CDLJ4
(
const
float
natoms
,
const
float
nAtomsCeiling
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
,
::
brook
::
stream
outforce3
,
::
brook
::
stream
outforce4
);
void
knbforce_CDLJ4Debug
(
const
float
natoms
,
const
float
nAtomsCeiling
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
,
::
brook
::
stream
outforce3
,
::
brook
::
stream
outforce4
);
void
knbforce_CDLJ_1
(
const
float
natoms
,
const
float
nAtomsCeiling
,
const
float
strwidth
,
const
float
epsfac
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce
);
void
knbforce_CDLJ4NoEx
(
const
float
natoms
,
const
float
nAtomsCeiling
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
,
::
brook
::
stream
outforce3
,
::
brook
::
stream
outforce4
);
void
knbforce_LDLJ
(
const
float
natoms
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
const
__BRTIter
&
a_iatom2
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
);
void
knbforce_SFDLJ
(
const
float
natoms
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
const
__BRTIter
&
a_iatom2
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
);
void
knbforce_SHEFFIELD
(
const
float
natoms
,
const
float
dupfac
,
const
float
strheight
,
const
float
strwidth
,
const
float
jstrwidth
,
const
float
fstrwidth
,
const
float
epsfac
,
const
float4
params
,
const
__BRTIter
&
a_iatom2
,
::
brook
::
stream
posq
,
::
brook
::
stream
isigeps
,
::
brook
::
stream
sigma
,
::
brook
::
stream
epsilon
,
::
brook
::
stream
excl
,
::
brook
::
stream
outforce1
,
::
brook
::
stream
outforce2
);
void
kmerge_partial_forces
(
const
float
repfac
,
const
float
atomStrWidth
,
const
float
pforceStrWidth
,
const
float
natoms
,
::
brook
::
stream
pforce1
,
::
brook
::
stream
pforce2
,
::
brook
::
stream
force
);
void
kMergeFloat3_4
(
float
repfac
,
float
atomStreamWidth
,
float
pStreamWidth
,
float
natoms
,
float
roundNatoms
,
float
iUnroll
,
::
brook
::
stream
pforce1
,
::
brook
::
stream
pforce2
,
::
brook
::
stream
pforce3
,
::
brook
::
stream
pforce4
,
::
brook
::
stream
force
);
void
kMergeFloat3_4_nobranch
(
float
repfac
,
float
atomStreamWidth
,
float
pStreamWidth
,
float
natoms
,
float
roundNatoms
,
float
iUnroll
,
::
brook
::
stream
pforce1
,
::
brook
::
stream
pforce2
,
::
brook
::
stream
pforce3
,
::
brook
::
stream
pforce4
,
::
brook
::
stream
force
);
typedef
void
(
*
gpuNBForceFunction14
)
(
const
float
natoms
,
const
float
strwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
atoms
,
::
brook
::
stream
posq
,
::
brook
::
stream
sigeps
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
);
void
kforce14_CDLJ
(
const
float
natoms
,
const
float
strwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
atoms
,
::
brook
::
stream
posq
,
::
brook
::
stream
sigeps
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
);
void
kforce14_LDLJ
(
const
float
natoms
,
const
float
strwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
atoms
,
::
brook
::
stream
posq
,
::
brook
::
stream
sigeps
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
);
void
kforce14_SFDLJ
(
const
float
natoms
,
const
float
strwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
atoms
,
::
brook
::
stream
posq
,
::
brook
::
stream
sigeps
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
);
void
kforce14_SHEFFIELD
(
const
float
natoms
,
const
float
strwidth
,
const
float
epsfac
,
const
float4
params
,
::
brook
::
stream
atoms
,
::
brook
::
stream
posq
,
::
brook
::
stream
sigeps
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
);
typedef
void
(
*
gpuBondedFunction
)(
const
float
epsfac
,
const
float
xstrwidth
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
atoms
,
::
brook
::
stream
parm0
,
::
brook
::
stream
parm1
,
::
brook
::
stream
parm2
,
::
brook
::
stream
parm3
,
::
brook
::
stream
parm4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
);
void
kbonded_CDLJ
(
const
float
epsfac
,
const
float
xstrwidth
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
atoms
,
::
brook
::
stream
parm0
,
::
brook
::
stream
parm1
,
::
brook
::
stream
parm2
,
::
brook
::
stream
parm3
,
::
brook
::
stream
parm4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
);
void
kbonded_CDLJDebug
(
const
float
epsfac
,
const
float
xstrwidth
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
atoms
,
::
brook
::
stream
parm0
,
::
brook
::
stream
parm1
,
::
brook
::
stream
parm2
,
::
brook
::
stream
parm3
,
::
brook
::
stream
parm4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
);
void
kbonded_LDLJ
(
const
float
epsfac
,
const
float
xstrwidth
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
atoms
,
::
brook
::
stream
parm0
,
::
brook
::
stream
parm1
,
::
brook
::
stream
parm2
,
::
brook
::
stream
parm3
,
::
brook
::
stream
parm4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
);
void
kbonded_SFDLJ
(
const
float
epsfac
,
const
float
xstrwidth
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
atoms
,
::
brook
::
stream
parm0
,
::
brook
::
stream
parm1
,
::
brook
::
stream
parm2
,
::
brook
::
stream
parm3
,
::
brook
::
stream
parm4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
);
void
kbonded_SHEFFIELD
(
const
float
epsfac
,
const
float
xstrwidth
,
const
float4
params
,
::
brook
::
stream
posq
,
::
brook
::
stream
atoms
,
::
brook
::
stream
parm0
,
::
brook
::
stream
parm1
,
::
brook
::
stream
parm2
,
::
brook
::
stream
parm3
,
::
brook
::
stream
parm4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
);
void
kAddForces3_4
(
const
float
conversion
,
::
brook
::
stream
force1
,
::
brook
::
stream
force2
,
::
brook
::
stream
outForce
);
#endif //__KFORCE_H__
platforms/brook/src/gpu/kforce_CDLJ.br
0 → 100644
View file @
b53cd947
kernel float4 scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2, float4 params ) {
float4 invr, invrsig2, invrsig6;
float4 f;
invr = rsqrt( r2 );
invrsig2 = invr * sig;
invrsig2 = invrsig2 * invrsig2;
invrsig6 = invrsig2 * invrsig2 * invrsig2;
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
f += epsfac*qq*invr;
f *= invr*invr;
return f;
}
kernel float scalar_force_single_CDLJ( float qq, float epsfac, float sig, float eps, float r2 ){
float invr, invrsig2, invrsig6;
float f;
invr = rsqrt( r2 );
invrsig2 = invr * sig;
invrsig2 = invrsig2 * invrsig2;
invrsig6 = invrsig2 * invrsig2 * invrsig2;
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
f += epsfac*qq*invr;
f *= invr*invr;
return f;
}
kernel float4 get_r2_CDLJ( float3 d1, float3 d2, float3 d3, float3 d4 ) {
float4 r2;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
return r2;
}
kernel void knbforce_CDLJ (
float natoms,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps<>,
float4 sigma[][],
float4 epsilon[][],
float2 excl[][],
out float3 outforce1<>,
out float3 outforce2<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float2 exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
outforce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
a_iatom = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
linind = fmod( a_iatom, natoms );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
ipos1 = posq[ iatom ].xyz;
qi1 = posq[ iatom ].w;
iatom.x += 1;
ipos2 = posq[ iatom ].xyz;
qi2 = posq[ iatom ].w;
breakflag = 1.0f;
jatom.y = 0.0f;
which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
jend = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1 - 0.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind/2.0f;
exclind.y = jstart * strwidth/4.0f;
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
exclusions = excl[ exclind ];
exclmask = fmod( exclusions.x, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos1 = posq[ jatom ].xyz;
qj.x = posq[ jatom ].w;
jatom.x += 1.0f;
jpos2 = posq[ jatom ].xyz;
qj.y = posq[ jatom ].w;
jatom.x += 1.0f;
jpos3 = posq[ jatom ].xyz;
qj.z = posq[ jatom ].w;
jatom.x += 1.0f;
jpos4 = posq[ jatom ].xyz;
qj.w = posq[ jatom ].w;
jatom.x += 1.0f;
sig = isigeps.x + jsig;
eps = isigeps.y * jeps;
qq = qi1 * qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
/*
outforce1.x += exclmask.x;
outforce1.x += exclmask.y;
outforce1.x += exclmask.z;
outforce1.x += exclmask.w;
if( exclmask.x > 0.5f ){
outforce1.x += 1.0f;
}
if( exclmask.y > 0.5f ){
outforce1.x += 1.0f;
}
if( exclmask.z > 0.5f ){
outforce1.x += 1.0f;
}
if( exclmask.w > 0.5f ){
outforce1.x += 1.0f;
}
*/
exclmask = fmod( exclusions.y, exclconst );
sig = isigeps.z + jsig;
eps = isigeps.w * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
/*
outforce2.x += exclmask.x;
outforce2.x += exclmask.y;
outforce2.x += exclmask.z;
outforce2.x += exclmask.w;
if( exclmask.x > 0.5f ){
outforce2.x += 1.0f;
}
if( exclmask.y > 0.5f ){
outforce2.x += 1.0f;
}
if( exclmask.z > 0.5f ){
outforce2.x += 1.0f;
}
if( exclmask.w > 0.5f ){
outforce2.x += 1.0f;
}
*/
exclind.y += 1.0f;
if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
breakflag = -1.0f;
}
jatom.y += 1.0f;
}
}
kernel void knbforce_CDLJ2(
float natoms,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2;
float isig1, isig2;
float ieps1, ieps2;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
outforce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
a_iatom = 2.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
linind = fmod( a_iatom, natoms );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
iatom.x += 1;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
breakflag = 1.0f;
jatom.y = 0.0f;
which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
jend = 1 + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1 - 0.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind;
exclind.y = jstart * strwidth/4.0f;
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x += 1.0f;
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
exclind.x += 1.0f;
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
exclind.x -= 1.0f;
exclind.y += 1.0f;
if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
breakflag = -1.0f;
}
jatom.y += 1.0f;
}
}
kernel void knbforce_CDLJ4(
float natoms,
float nAtomsCeiling,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<>,
out float3 outforce3<>,
out float3 outforce4<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2, ipos3, ipos4;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2, qi3, qi4;
float isig1, isig2, isig3, isig4;
float ieps1, ieps2, ieps3, ieps4;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float3( 0.0f, 0.0f, 0.0f );
outforce2 = float3( 0.0f, 0.0f, 0.0f );
outforce3 = float3( 0.0f, 0.0f, 0.0f );
outforce4 = float3( 0.0f, 0.0f, 0.0f );
a_iatom = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
// linind = fmod( a_iatom, natoms );
linind = fmod( a_iatom, nAtomsCeiling );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
/* --------------------------------------------------------------------- */
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos3 = jpos.xyz;
qi3 = jpos.w;
jstrindex = isigeps[ iatom ];
isig3 = jstrindex.x;
ieps3 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos4 = jpos.xyz;
qi4 = jpos.w;
jstrindex = isigeps[ iatom ];
isig4 = jstrindex.x;
ieps4 = jstrindex.y;
/* --------------------------------------------------------------------- */
breakflag = 1.0f;
jatom.y = 0.0f;
// which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
which_rep = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
jend = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind;
exclind.y = jstart * strwidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
/* --------------------------------------------------------------------- */
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
/* --------------------------------------------------------------------- */
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig3 + jsig;
eps = ieps3 * jeps;
qq = qi3 * qj;
d1 = ipos3 - jpos1;
d2 = ipos3 - jpos2;
d3 = ipos3 - jpos3;
d4 = ipos3 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce3 += fs.x * d1;
outforce3 += fs.y * d2;
outforce3 += fs.z * d3;
outforce3 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig4 + jsig;
eps = ieps4 * jeps;
qq = qi4 * qj;
d1 = ipos4 - jpos1;
d2 = ipos4 - jpos2;
d3 = ipos4 - jpos3;
d4 = ipos4 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce4 += fs.x * d1;
outforce4 += fs.y * d2;
outforce4 += fs.z * d3;
outforce4 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x -= 3.0f;
exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
exclind.y = floor( exclind.y + 1.25f );
if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
}
//jatom.y += 1.0f;
jatom.y = floor( jatom.y + 1.2f );
}
}
kernel void knbforce_CDLJ4Debug(
float natoms,
float nAtomsCeiling,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<>,
out float3 outforce3<>,
out float3 outforce4<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2, ipos3, ipos4;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2, qi3, qi4;
float isig1, isig2, isig3, isig4;
float ieps1, ieps2, ieps3, ieps4;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float linind1;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
float4 one4;
float4 zero4;
float4 frac4;
float sum;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float3( 0.0f, 0.0f, 0.0f );
outforce2 = float3( 0.0f, 0.0f, 0.0f );
outforce3 = float3( 0.0f, 0.0f, 0.0f );
outforce4 = float3( 0.0f, 0.0f, 0.0f );
one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
frac4 = float4( 0.2f, 0.2f, 0.2f, 0.2f );
a_iatom = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
// linind = fmod( a_iatom, natoms );
linind = fmod( a_iatom, nAtomsCeiling );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
/* --------------------------------------------------------------------- */
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1;
jpos = posq[ iatom ];
ipos3 = jpos.xyz;
qi3 = jpos.w;
jstrindex = isigeps[ iatom ];
isig3 = jstrindex.x;
ieps3 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1;
jpos = posq[ iatom ];
ipos4 = jpos.xyz;
qi4 = jpos.w;
jstrindex = isigeps[ iatom ];
isig4 = jstrindex.x;
ieps4 = jstrindex.y;
/* --------------------------------------------------------------------- */
breakflag = 1.0f;
jatom.y = 0.0f;
// which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
which_rep = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
jend = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep*jend;
if ( which_rep > dupfac - 1.5f ) {
jend = 1e6f;
}
jend += jstart;
exclind.x = linind;
linind1 = linind;
exclind.y = jstart*strwidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
/*
outforce1 = float3( linind, exclind.y, jstart );
outforce2 = float3( linind, exclind.y, jend );
outforce3 = float3( linind, exclind.y, strwidth );
outforce4 = float3( linind, exclind.y, exclind.y );
*/
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
linind = (jatom.x + jatom.y*strwidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y*jstrwidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
// ---------------------------------------------------------------------
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
// ---------------------------------------------------------------------
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce1 += fs.x * d1;
// outforce1 += fs.y * d2;
// outforce1 += fs.z * d3;
// outforce1 += fs.w * d4;
// outforce1 += float3( 4.0, 4.0, 4.0);
// outforce1 += 1.0/r2;
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
/*
if( sum > 0.25f ){
outforce1.x += 1.0f;
outforce1.y += exclusions;
outforce1.z += sum;
// outforce1.y += exclind.x + strwidth*exclind.y;
//outforce1.y += 4.0;
}
*/
if( exclind.y < 5.9f && exclind.y > 4.1f ){
outforce1.y += exclind.x + 576.0f*exclind.y;
outforce1.z += exclusions;
outforce1.x += exclind.y;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask*10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce2 += fs.x * d1;
// outforce2 += fs.y * d2;
// outforce2 += fs.z * d3;
// outforce2 += fs.w * d4;
// outforce2 += float3( 4.0f, 4.0f, 4.0f);
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
outforce2.x += 1.0f;
outforce2.y += exclusions;
outforce2.z += sum;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig3 + jsig;
eps = ieps3 * jeps;
qq = qi3 * qj;
d1 = ipos3 - jpos1;
d2 = ipos3 - jpos2;
d3 = ipos3 - jpos3;
d4 = ipos3 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce3 += fs.x * d1;
// outforce3 += fs.y * d2;
// outforce3 += fs.z * d3;
// outforce3 += fs.w * d4;
// outforce3 += float3( 4.0f, 4.0f, 4.0f);
// outforce3 += 1.0f/r2;
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
outforce3.x += 1.0f;
outforce3.y += exclusions;
outforce3.z += sum;
// outforce3.y += exclind.x + strwidth*exclind.y;
//outforce3.y += 4.0;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = isig4 + jsig;
eps = ieps4 * jeps;
qq = qi4 * qj;
d1 = ipos4 - jpos1;
d2 = ipos4 - jpos2;
d3 = ipos4 - jpos3;
d4 = ipos4 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 ) + exclmask * 10000.0f;
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
// outforce4 += fs.x * d1;
// outforce4 += fs.y * d2;
// outforce4 += fs.z * d3;
// outforce4 += fs.w * d4;
// outforce4 += float3( 4.0f, 4.0f, 4.0f);
// outforce4 += 1.0f/r2;
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
outforce4.x += 1.0f;
outforce4.y += exclusions;
outforce4.z += sum;
// outforce4.y += exclind.x + strwidth*exclind.y;
//outforce4.y += 4.0;
}
// ---------------------------------------------------------------------
exclind.x = floor( exclind.x - 2.9f );
exclind.y = floor( exclind.y + 1.2f );
if ( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )
breakflag = -1.0f;
}
jatom.y = floor( jatom.y + 1.2f );
}
}
kernel void kbonded_CDLJ (
float xstrwidth,
float4 posq[][],
float4 atoms<>,
float4 parm0<>,
float4 parm1<>,
float4 parm2<>,
float4 parm3<>,
float4 parm4<>,
out float3 fi<>,
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
)
{
float3 rij, rkj, rkl, ril;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float ddphi, ddphi_rb;
float3 u, v, s;
float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
float3 uij, ukj, ukl;
float phi, mdphi;
float rij_d_ukj, ukj_d_rkl;
float normfac;
float qq;
float3 fi_ang, fj_ang, fk_ang;
float3 fi_bond;
float cii, ckk, cik;
float fs, st, sth;
float3 fi_pair;
float r2 ;
float torsion_numerator;
float theta1, costheta1, invsintheta1;
float theta2, costheta2, invsintheta2;
float4 posqi, posqj, posqk, posql;
float sig, eps;
ai.y = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
posqi = posq[ ai ];
if ( atoms.y > -0.5f ) {
aj.y = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
posqj = posq[ aj ];
}
else
posqj = float4( 0.0f, 0.0f, 1.0f, 0.0f );
if ( atoms.z > -0.5f ) {
ak.y = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
posqk = posq[ ak ];
}
else
posqk = float4( 0.0f, 1.0f, 1.0f, 0.0f );
if ( atoms.w > -0.5f ) {
al.y = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
posql = posq[ al ];
}
else
posql = float4( 1.0f, 1.0f, 1.0f, 0.0f );
qq = posqi.w * posql.w;
rij = posqi.xyz - posqj.xyz;
rkj = posqk.xyz - posqj.xyz;
rkl = posqk.xyz - posql.xyz;
ril = posqi.xyz - posql.xyz;
invlij = rsqrt( dot( rij, rij ) );
invlkj = rsqrt( dot( rkj, rkj ) );
invlkl = rsqrt( dot( rkl, rkl ) );
uij = rij * invlij;
ukj = rkj * invlkj;
ukl = rkl * invlkl;
rij_d_ukj = dot( rij, ukj );
ukj_d_rkl = dot( ukj, rkl );
m = cross( uij, ukj );
n = cross( ukj, ukl );
costheta1 = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
theta1 = acos( costheta1 );
invsintheta1 = rsqrt( 1.0f - costheta1 * costheta1 );
costheta2 = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
theta2 = acos( costheta2 );
invsintheta2 = rsqrt( 1.0f - costheta2 * costheta2 );
cos_phi = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
sgnphi = sign( dot(rij, n) );
phi = sgnphi * acos( cos_phi );
mdphi = parm1.w * phi - parm1.z;
ddphi = -parm1.y * parm1.w * sin( mdphi );
cos_phi = -cos_phi;
sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) );
ddphi_rb = 5.0f * parm1.x;
ddphi_rb = 4.0f * parm0.w + ddphi_rb * cos_phi;
ddphi_rb = 3.0f * parm0.z + ddphi_rb * cos_phi;
ddphi_rb = 2.0f * parm0.y + ddphi_rb * cos_phi;
ddphi_rb = parm0.x + ddphi_rb * cos_phi;
ddphi_rb = -ddphi_rb * sin_phi;
ddphi += ddphi_rb;
fi = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
fl = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
u = rij_d_ukj * invlkj * fi;
v = ukj_d_rkl * invlkj * fl;
s = u - v;
fj = s - fi;
fk = -(s + fl);
fs = -parm2.y * ( theta1 - parm2.x );
st = fs * invsintheta1;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta1;
cik = st * invlkj * invlij;
cii = sth * invlij;
ckk = sth * invlkj;
fi_ang = -( cik * rkj - cii * uij );
fk_ang = -( cik * rij - ckk * ukj );
fj_ang = -fi_ang - fk_ang;
fi += fi_ang;
fj += fj_ang;
fk += fk_ang;
fs = -parm2.w * ( theta2 - parm2.z );
st = fs * invsintheta2;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta2;
cik = st * invlkj * invlkl;
cii = sth * invlkj;
ckk = sth * invlkl;
fi_ang = ( cik * rkl - cii * ukj );
fk_ang = ( cik * rkj - ckk * ukl );
fj_ang = -fi_ang - fk_ang;
fj += fi_ang;
fk += fj_ang;
fl += fk_ang;
fi_bond = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
fi += fi_bond;
fj += -fi_bond;
fi_bond = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
fj += fi_bond;
fk += -fi_bond;
fi_bond = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
fk += fi_bond;
fl += -fi_bond;
if ( parm4.z > -0.5f ) {
r2 = dot( ril, ril );
fs = scalar_force_single_CDLJ( qq, epsfac, parm4.z, parm4.w, r2 );
fi_pair = fs * ril;
fi += fi_pair;
fl -= fi_pair;
}
}
kernel void kbonded_CDLJDebug (
float epsfac,
float xstrwidth,
float4 nbparams,
float4 posq[][],
float4 atoms<>,
float4 parm0<>,
float4 parm1<>,
float4 parm2<>,
float4 parm3<>,
float4 parm4<>,
out float3 fi<>,
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
)
{
float3 rij, rkj, rkl, ril;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float ddphi, ddphi_rb;
float3 u, v, s;
float invlkj, invlij, invlkl, msq, nsq, cos_phi, sin_phi;
float3 uij, ukj, ukl;
float phi, mdphi;
float rij_d_ukj, ukj_d_rkl;
float normfac;
float qq;
float3 fi_ang, fj_ang, fk_ang;
float3 fi_bond;
float cii, ckk, cik;
float fs, st, sth;
float3 fi_pair;
float r2 ;
float torsion_numerator;
float theta1, costheta1, invsintheta1;
float theta2, costheta2, invsintheta2;
float4 posqi, posqj, posqk, posql;
float sig, eps;
ai.y = round( (atoms.x - fmod( atoms.x, xstrwidth ))/xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
posqi = posq[ ai ];
if ( atoms.y > -0.5f ) {
aj.y = round( (atoms.y - fmod( atoms.y, xstrwidth ))/xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
posqj = posq[ aj ];
} else {
posqj = float4( 0.0f, 0.0f, 1.0f, 0.0f );
}
if ( atoms.z > -0.5f ) {
ak.y = round( (atoms.z - fmod( atoms.z, xstrwidth ))/xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
posqk = posq[ ak ];
} else {
posqk = float4( 0.0f, 1.0f, 1.0f, 0.0f );
}
if ( atoms.w > -0.5f ) {
al.y = round( (atoms.w - fmod( atoms.w, xstrwidth ))/xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
posql = posq[ al ];
} else {
posql = float4( 1.0f, 1.0f, 1.0f, 0.0f );
}
qq = posqi.w * posql.w;
rij = posqi.xyz - posqj.xyz;
rkj = posqk.xyz - posqj.xyz;
rkl = posqk.xyz - posql.xyz;
ril = posqi.xyz - posql.xyz;
invlij = rsqrt( dot( rij, rij ) );
invlkj = rsqrt( dot( rkj, rkj ) );
invlkl = rsqrt( dot( rkl, rkl ) );
uij = rij * invlij;
ukj = rkj * invlkj;
ukl = rkl * invlkl;
rij_d_ukj = dot( rij, ukj );
ukj_d_rkl = dot( ukj, rkl );
m = cross( uij, ukj );
n = cross( ukj, ukl );
costheta1 = clamp( rij_d_ukj * invlij, -1.0f, 1.0f );
theta1 = acos( costheta1 );
invsintheta1 = rsqrt( 1.0f - costheta1 * costheta1 );
costheta2 = clamp( ukj_d_rkl * invlkl, -1.0f, 1.0f );
theta2 = acos( costheta2 );
invsintheta2 = rsqrt( 1.0f - costheta2 * costheta2 );
cos_phi = clamp( dot(m,n) * invsintheta1 * invsintheta2, -1.0f, 1.0f );
sgnphi = sign( dot(rij, n) );
phi = sgnphi * acos( cos_phi );
mdphi = parm1.w * phi - parm1.z;
ddphi = -parm1.y * parm1.w * sin( mdphi );
cos_phi = -cos_phi;
sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) );
ddphi_rb = 5.0f * parm1.x;
ddphi_rb = 4.0f * parm0.w + ddphi_rb * cos_phi;
ddphi_rb = 3.0f * parm0.z + ddphi_rb * cos_phi;
ddphi_rb = 2.0f * parm0.y + ddphi_rb * cos_phi;
ddphi_rb = parm0.x + ddphi_rb * cos_phi;
ddphi_rb = -ddphi_rb * sin_phi;
ddphi += ddphi_rb;
//ddphi = ddphi_rb;
//ddphi = 0.0f;
fi = (-ddphi * invlij * invsintheta1 * invsintheta1 ) * m;
fl = ( ddphi * invlkl * invsintheta2 * invsintheta2 ) * n;
u = rij_d_ukj * invlkj * fi;
v = ukj_d_rkl * invlkj * fl;
s = u - v;
fj = s - fi;
fk = -(s + fl);
fs = -parm2.y * ( theta1 - parm2.x );
st = fs * invsintheta1;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta1;
cik = st * invlkj * invlij;
cii = sth * invlij;
ckk = sth * invlkj;
fi_ang = -( cik * rkj - cii * uij );
fk_ang = -( cik * rij - ckk * ukj );
fj_ang = -fi_ang - fk_ang;
/*
fi += fi_ang;
fj += fj_ang;
fk += fk_ang;
*/
fs = -parm2.w * ( theta2 - parm2.z );
st = fs * invsintheta2;
st = clamp( st, -100000.0f, 100000.0f );
sth = st * costheta2;
cik = st * invlkj * invlkl;
cii = sth * invlkj;
ckk = sth * invlkl;
fi_ang = ( cik * rkl - cii * ukj );
fk_ang = ( cik * rkj - ckk * ukl );
fj_ang = -fi_ang - fk_ang;
/*
fj += fi_ang;
fk += fj_ang;
fl += fk_ang;
*/
// bonded terms
// k*(r - d)*deltaX/r = k*(1 - d/r)*deltaX
/*
fi = float3( 0.0f, 0.0f, 0.0f );
fj = float3( 0.0f, 0.0f, 0.0f );
fk = float3( 0.0f, 0.0f, 0.0f );
fl = float3( 0.0f, 0.0f, 0.0f );
*/
/*
// ------------------------------------------------------------
fi_bond = -parm3.y * ( 1.0f - parm3.x * invlij ) * rij;
fi += fi_bond;
fj += -fi_bond;
// ------------------------------------------------------------
fi_bond = parm3.w * ( 1.0f - parm3.z * invlkj ) * rkj;
fj += fi_bond;
fk += -fi_bond;
// ------------------------------------------------------------
fi_bond = -parm4.y * ( 1.0f - parm4.x * invlkl ) * rkl;
fk += fi_bond;
fl += -fi_bond;
*/
// ------------------------------------------------------------
// LJ14 terms
//fi = float3( 0.0f, 0.0f, 0.0f);
//fj = float3( 0.0f, 0.0f, 0.0f);
//fk = float3( 0.0f, 0.0f, 0.0f);
//fl = float3( 0.0f, 0.0f, 0.0f);
/*
if ( parm4.z > -0.5f ) {
r2 = dot( ril, ril );
fs = scalar_force_single_CDLJ( qq, epsfac, parm4.z, parm4.w, r2 );
fi_pair = fs * ril;
//fi_pair = float3( posqi.w + posql.w, posqi.w, posql.w );
//fi_pair = float3( posqi.w + posql.w, posqi.w, posql.w );
//fi_pair = float3( atoms.x, parm4.z, parm4.w );
//fi_pair = float3( 0.0f, 0.0f, 0.0f);
fi += fi_pair;
fl -= fi_pair;
//fi = fi_pair;
//fl = float3( atoms.w, parm4.z, parm4.w );
}
*/
}
kernel void knbforce_CDLJ4NoEx(
float natoms,
float nAtomsCeiling,
float dupfac,
float strheight,
float strwidth,
float jstrwidth,
float fstrwidth,
float epsfac,
float4 params,
float4 posq[][],
float4 isigeps[][],
float4 sigma[][],
float4 epsilon[][],
float excl[][],
out float3 outforce1<>,
out float3 outforce2<>,
out float3 outforce3<>,
out float3 outforce4<> ) {
float4 jsig, jeps;
float3 ipos1, ipos2, ipos3, ipos4;
float4 jpos;
float3 jpos1, jpos2, jpos3, jpos4;
float qi1, qi2, qi3, qi4;
float isig1, isig2, isig3, isig4;
float ieps1, ieps2, ieps3, ieps4;
float4 qj, qq;
float3 d1, d2, d3, d4;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2;
float3 pad;
float4 exclmask;
float4 exclconst;
float2 iatom;
float2 jstrindex;
float a_iatom;
float linind;
float breakflag;
float2 jatom;
float4 fs;
float jend, jstart;
float which_rep;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
outforce1 = float3( 0.0f, 0.0f, 0.0f );
outforce2 = float3( 0.0f, 0.0f, 0.0f );
outforce3 = float3( 0.0f, 0.0f, 0.0f );
outforce4 = float3( 0.0f, 0.0f, 0.0f );
a_iatom = 4.0f*((indexof outforce1).x + (indexof outforce1).y * fstrwidth);
// linind = fmod( a_iatom, natoms );
linind = fmod( a_iatom, nAtomsCeiling );
iatom.x = fmod( linind, strwidth );
iatom.y = round( (linind - fmod(linind, strwidth))/strwidth );
/* --------------------------------------------------------------------- */
jpos = posq[ iatom ];
ipos1 = jpos.xyz;
qi1 = jpos.w;
jstrindex = isigeps[ iatom ];
isig1 = jstrindex.x;
ieps1 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos2 = jpos.xyz;
qi2 = jpos.w;
jstrindex = isigeps[ iatom ];
isig2 = jstrindex.x;
ieps2 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos3 = jpos.xyz;
qi3 = jpos.w;
jstrindex = isigeps[ iatom ];
isig3 = jstrindex.x;
ieps3 = jstrindex.y;
/* --------------------------------------------------------------------- */
iatom.x += 1.0f;
jpos = posq[ iatom ];
ipos4 = jpos.xyz;
qi4 = jpos.w;
jstrindex = isigeps[ iatom ];
isig4 = jstrindex.x;
ieps4 = jstrindex.y;
/* --------------------------------------------------------------------- */
breakflag = 1.0f;
jatom.y = 0.0f;
// which_rep = round( (a_iatom - fmod(a_iatom, natoms))/natoms );
which_rep = round( (a_iatom - fmod(a_iatom, nAtomsCeiling ))/nAtomsCeiling );
jend = 1.0f + round( (natoms - fmod(natoms, (dupfac * strwidth )))/(dupfac * strwidth ) );
jstart = which_rep * jend;
if ( which_rep > dupfac - 1.5f ) {
jend = 1e6f;
}
jend += jstart;
//exclind.x = linind;
//exclind.y = jstart * strwidth/4.0f;
//exclind.y = floor( exclind.y + 0.25f );
jatom.y = jstart;
while ( jatom.y < jend && breakflag > 0.0f ) {
jatom.x = 0;
while ( jatom.x < strwidth && breakflag > 0.0f ) {
// exclusions = excl[ exclind ];
// exclmask = fmod( exclusions, exclconst );
linind = (jatom.x + jatom.y*strwidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstrwidth))/jstrwidth );
jstrindex.x = linind - jstrindex.y * jstrwidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
/* --------------------------------------------------------------------- */
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
jpos = posq[ jatom ];
jpos1 = jpos.xyz;
qj.x = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos2 = jpos.xyz;
qj.y = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos3 = jpos.xyz;
qj.z = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
jpos = posq[ jatom ];
jpos4 = jpos.xyz;
qj.w = jpos.w;
jatom.x = floor( jatom.x + 1.2f );
//jatom.x += 1.0f;
/* --------------------------------------------------------------------- */
sig = isig1 + jsig;
eps = ieps1 * jeps;
qq = qi1*qj;
d1 = ipos1 - jpos1;
d2 = ipos1 - jpos2;
d3 = ipos1 - jpos3;
d4 = ipos1 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce1 += fs.x * d1;
outforce1 += fs.y * d2;
outforce1 += fs.z * d3;
outforce1 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
/*
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
*/
sig = isig2 + jsig;
eps = ieps2 * jeps;
qq = qi2 * qj;
d1 = ipos2 - jpos1;
d2 = ipos2 - jpos2;
d3 = ipos2 - jpos3;
d4 = ipos2 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce2 += fs.x * d1;
outforce2 += fs.y * d2;
outforce2 += fs.z * d3;
outforce2 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
/*
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
*/
sig = isig3 + jsig;
eps = ieps3 * jeps;
qq = qi3 * qj;
d1 = ipos3 - jpos1;
d2 = ipos3 - jpos2;
d3 = ipos3 - jpos3;
d4 = ipos3 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce3 += fs.x * d1;
outforce3 += fs.y * d2;
outforce3 += fs.z * d3;
outforce3 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x += 1.0f;
/*
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
*/
sig = isig4 + jsig;
eps = ieps4 * jeps;
qq = qi4 * qj;
d1 = ipos4 - jpos1;
d2 = ipos4 - jpos2;
d3 = ipos4 - jpos3;
d4 = ipos4 - jpos4;
r2 = get_r2_CDLJ( d1, d2, d3, d4 );
fs = scalar_force_CDLJ( qq, epsfac, sig, eps, r2, params );
outforce4 += fs.x * d1;
outforce4 += fs.y * d2;
outforce4 += fs.z * d3;
outforce4 += fs.w * d4;
/* --------------------------------------------------------------------- */
//exclind.x -= 3.0f;
//exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
//exclind.y = floor( exclind.y + 1.25f );
if( natoms - (jatom.y*strwidth + jatom.x) < 0.9f )breakflag = -1.0f;
}
//jatom.y += 1.0f;
jatom.y = floor( jatom.y + 1.2f );
}
}
platforms/brook/src/gpu/kinvmap_gather.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Gather kernel for use with diferent angle, dihedral and improper functions
//
//For small systems, the overhead of calling the kernel is so high
//That we have to minimize the number of kernel calls. At the same
//time we don't want to waste reads, so I'm providing a bunch of kernels
//here that are unrolled to diferent extents. Use the appropriate one
//depending on how many inverse maps there are, rather than looping over
//the simple one.
//helper function to make the unrolling look better
kernel float3 do_gather( float strwidth, float4 invmap<>, float3 forces[][] ) {
float3 f;
float4 quotient, remainder;
float2 idx;
f = float3( 0.0f, 0.0f, 0.0f );
//Convert from linear to 2D index
// quotient = floor( invmap / strwidth );
quotient = round( ( invmap - fmod(invmap, strwidth))/strwidth );
remainder = invmap - quotient * strwidth;
//Add each force only if non-negative
if ( invmap.x >= 0.0f ) {
idx.y = quotient.x;
idx.x = remainder.x;
f = forces[ idx ];
}
if ( invmap.y >= 0.0f ) {
idx.y = quotient.y;
idx.x = remainder.y;
f += forces[ idx ];
}
if ( invmap.z >= 0.0f ) {
idx.y = quotient.z;
idx.x = remainder.z;
f += forces[ idx ];
}
if ( invmap.w >= 0.0f ) {
idx.y = quotient.w;
idx.x = remainder.w;
f += forces[ idx ];
}
return f;
}
//Simple version, takes only one index stream
kernel void kinvmap_gather(
float strwidth, //stream width of the dihedral forces
float4 invmap<>, //indices into the dihedral forces
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap, forces );
}
//Takes two inverse maps
kernel void kinvmap_gather2(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
}
//Takes three inverse maps
kernel void kinvmap_gather3(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
}
//Takes four inverse maps
kernel void kinvmap_gather4(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float4 invmap4<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
outforce += do_gather( strwidth, invmap4, forces );
}
//Takes five inverse maps
kernel void kinvmap_gather5(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float4 invmap4<>,
float4 invmap5<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
outforce += do_gather( strwidth, invmap4, forces );
outforce += do_gather( strwidth, invmap5, forces );
}
//Takes six inverse maps - this is the last one!
kernel void kinvmap_gather6(
float strwidth, //stream width of the dihedral forces
float4 invmap1<>, //indices into the dihedral forces
float4 invmap2<>, //indices into the dihedral forces
float4 invmap3<>,
float4 invmap4<>,
float4 invmap5<>,
float4 invmap6<>,
float3 forces[][], //dihedral forces
float3 inforce<>, //particle forces before
out float3 outforce<> //particle forces after
)
{
outforce = inforce;
outforce += do_gather( strwidth, invmap1, forces );
outforce += do_gather( strwidth, invmap2, forces );
outforce += do_gather( strwidth, invmap3, forces );
outforce += do_gather( strwidth, invmap4, forces );
outforce += do_gather( strwidth, invmap5, forces );
outforce += do_gather( strwidth, invmap6, forces );
}
kernel float3 etch_force(
float fpos,
float strwidth,
float3 fi[][],
float3 fj[][],
float3 fk[][],
float3 fl[][]
)
{
float2 ind;
float _fpos;
_fpos = fpos;
if ( _fpos > 300000.0f ) {
_fpos = _fpos - 300000.0f;
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fl[ ind ];
}
else if ( _fpos > 200000.0f ) {
_fpos = _fpos - 200000.0f;
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fk[ ind ];
}
else if ( _fpos > 100000.0f ) {
_fpos = _fpos - 100000.0f;
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fj[ ind ];
}
else if ( _fpos >= -0.5f ) {
//ind.y = floor( _fpos / strwidth );
ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
ind.x = _fpos - ind.y * strwidth;
return fi[ ind ];
}
else
return 0.0f;
}
//For-loop version doesn't work
//Using a merged version of the above
kernel float2 linear_to_2D( float linind, float width )
{
float2 ind;
//ind.y = floor( linind / width );
ind.y = round( ( linind - fmod( linind, width))/width );
ind.x = linind - ind.y * width;
return ind;
}
//helper function to make the unrolling look better
kernel float3 do_gather_merged_single( float strwidth, float invmap,
float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][] ) {
float3 f;
float2 idx;
float _invmap;
float n;
_invmap = invmap;
n = floor( _invmap / 100000.0f );
_invmap -= n * 100000.0f;
idx = linear_to_2D( _invmap, strwidth );
if ( n > 2.5f ) {
f = fl[ idx ];
}
else if ( n > 1.5f ) {
f = fk[ idx ];
}
else if ( n > 0.5f ) {
f = fj[ idx ];
}
else if( n > -0.5f ) {
f = fi[ idx ];
}
return f;
}
kernel float3 do_gather_merged( float strwidth, float4 invmap,
float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][])
{
float3 f;
f = do_gather_merged_single( strwidth, invmap.x, fi, fj, fk, fl )
+ do_gather_merged_single( strwidth, invmap.y, fi, fj, fk, fl )
+ do_gather_merged_single( strwidth, invmap.z, fi, fj, fk, fl )
+ do_gather_merged_single( strwidth, invmap.w, fi, fj, fk, fl );
return f;
}
kernel void kinvmap_gather_merged5(
float natoms, //number of atoms
float strwidth, //stream width of out-of-order forces
float4 invmap0<>,
float4 invmap1<>,
float4 invmap2<>,
float4 invmap3<>,
float4 invmap4<>,
float3 fi[][], //i-forces
float3 fj[][], //j-forces
float3 fk[][], //k-forces
float3 fl[][], //l-forces
float3 inforce<>,
out float3 outforce<>
)
{
outforce = inforce;
outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap4, fi, fj, fk, fl );
}
kernel void kinvmap_gather_merged9(
float natoms, //number of atoms
float strwidth, //stream width of out-of-order forces
float4 invmap0<>,
float4 invmap1<>,
float4 invmap2<>,
float4 invmap3<>,
float4 invmap4<>,
float4 invmap5<>,
float4 invmap6<>,
float4 invmap7<>,
float4 invmap8<>,
float3 fi[][], //i-forces
float3 fj[][], //j-forces
float3 fk[][], //k-forces
float3 fl[][], //l-forces
float3 inforce<>,
out float3 outforce<>
)
{
outforce = inforce;
outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap4, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap5, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap6, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap7, fi, fj, fk, fl )
+ do_gather_merged( strwidth, invmap8, fi, fj, fk, fl );
}
platforms/brook/src/gpu/kinvmap_gather.h
0 → 100644
View file @
b53cd947
void
kinvmap_gather
(
const
float
strwidth
,
::
brook
::
stream
invmap
,
::
brook
::
stream
forces
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather2
(
const
float
strwidth
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
forces
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather3
(
const
float
strwidth
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
invmap3
,
::
brook
::
stream
forces
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather4
(
const
float
strwidth
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
invmap3
,
::
brook
::
stream
invmap4
,
::
brook
::
stream
forces
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather5
(
const
float
strwidth
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
invmap3
,
::
brook
::
stream
invmap4
,
::
brook
::
stream
invmap5
,
::
brook
::
stream
forces
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather6
(
const
float
strwidth
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
invmap3
,
::
brook
::
stream
invmap4
,
::
brook
::
stream
invmap5
,
::
brook
::
stream
invmap6
,
::
brook
::
stream
forces
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather3_4
(
const
float
strwidth
,
::
brook
::
stream
invmap3_1
,
::
brook
::
stream
invmap3_2
,
::
brook
::
stream
invmap3_3
,
::
brook
::
stream
forces3
,
::
brook
::
stream
invmap4_1
,
::
brook
::
stream
invmap4_2
,
::
brook
::
stream
invmap4_3
,
::
brook
::
stream
invmap4_4
,
::
brook
::
stream
forces4
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather3_5
(
const
float
strwidth
,
::
brook
::
stream
invmap3_1
,
::
brook
::
stream
invmap3_2
,
::
brook
::
stream
invmap3_3
,
::
brook
::
stream
forces3
,
::
brook
::
stream
invmap5_1
,
::
brook
::
stream
invmap5_2
,
::
brook
::
stream
invmap5_3
,
::
brook
::
stream
invmap5_4
,
::
brook
::
stream
invmap5_5
,
::
brook
::
stream
forces5
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather5_2
(
const
float
strwidth
,
::
brook
::
stream
invmap5_1
,
::
brook
::
stream
invmap5_2
,
::
brook
::
stream
invmap5_3
,
::
brook
::
stream
invmap5_4
,
::
brook
::
stream
invmap5_5
,
::
brook
::
stream
forces5
,
::
brook
::
stream
invmap2_1
,
::
brook
::
stream
invmap2_2
,
::
brook
::
stream
forces2
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather4_2
(
const
float
strwidth
,
::
brook
::
stream
invmap4_1
,
::
brook
::
stream
invmap4_2
,
::
brook
::
stream
invmap4_3
,
::
brook
::
stream
invmap4_4
,
::
brook
::
stream
forces4
,
::
brook
::
stream
invmap2_1
,
::
brook
::
stream
invmap2_2
,
::
brook
::
stream
forces2
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather_merged
(
const
float
natoms
,
const
float
strwidth
,
const
float
istrwidth
,
const
float
fstrwidth
,
::
brook
::
stream
nimap
,
::
brook
::
stream
invmap
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather_merged9
(
const
float
natoms
,
const
float
strwidth
,
::
brook
::
stream
invmap0
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
invmap3
,
::
brook
::
stream
invmap4
,
::
brook
::
stream
invmap5
,
::
brook
::
stream
invmap6
,
::
brook
::
stream
invmap7
,
::
brook
::
stream
invmap8
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
void
kinvmap_gather_merged5
(
const
float
natoms
,
const
float
strwidth
,
::
brook
::
stream
invmap0
,
::
brook
::
stream
invmap1
,
::
brook
::
stream
invmap2
,
::
brook
::
stream
invmap3
,
::
brook
::
stream
invmap4
,
::
brook
::
stream
fi
,
::
brook
::
stream
fj
,
::
brook
::
stream
fk
,
::
brook
::
stream
fl
,
::
brook
::
stream
inforce
,
::
brook
::
stream
outforce
);
platforms/brook/src/gpu/kmerge_partial_forces.br
0 → 100644
View file @
b53cd947
/* After forces above, we have the forces for even numbered particles
* in one stream, odd numbered particles in another.
* In each stream, the forces are in several parts depending on how
* many times we replicated the input stream.
*
* This kernel should not be a bottle neck, but if it turns out to
* be so, we can try some work arounds.
*
* To avoid an extra kernel to zero forces, this sets the forces
* rather than adding to it.
* */
kernel static void kmerge_partial_forces(
float repfac,
float atomStrWidth,
float pforceStrWidth,
float natoms,
float3 pforce1[][],
float3 pforce2[][],
out float3 force<> )
{
float linind;
float2 pindex;
float odd;
float i;
float2 adjustcount = (indexof force);
//convert to linear atom index
linind = adjustcount.x + adjustcount.y * atomStrWidth;
//If odd or even, we pick from diferent streams.
odd = linind - floor( linind / 2.0f ) * 2.0f;
//Now linear index is the index into partial_forces
linind = floor( linind / 2.0f );
force = float3( 0.0f, 0.0f, 0.0f );
//If we have predicated conditionals, we should
//keep the conditional inside the loop
for ( i = 0; i < repfac; i+=1.0f ) {
pindex.y = floor( linind / pforceStrWidth );
pindex.x = linind - pindex.y * pforceStrWidth;
//pindex.x = fmod( linind, pforceStrWidth );
if ( odd > 0.5f ) { //is odd
force += pforce2[ pindex ];
}
else {
force += pforce1[ pindex ];
}
linind += natoms/2.0f;
}
}
kernel void kMergeFloat3_4(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float3 pstream1[][],
float3 pstream2[][],
float3 pstream3[][],
float3 pstream4[][],
out float3 outstream<> )
{
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
// given atom index find force indices and streams
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
outstream = float3( 0.0f, 0.0f, 0.0f );
for( i = 0.0f; i < repfac; i += 1.0f ){
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
pindex.x = qIndex - pindex.y*pStreamWidth;
if ( qOff < 0.5f ){
outstream += pstream1[ pindex ];
} else if( qOff < 1.5f ){
outstream += pstream2[ pindex ];
} else if( qOff < 2.5f ){
outstream += pstream3[ pindex ];
} else {
outstream += pstream4[ pindex ];
}
forceIndex += roundNatoms;
}
}
platforms/brook/src/gpu/knlforce.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/*
* 3rd attempt at writing a force kernel that uses neighborlists.
* Keeping things very simple this time - no unrolling at all.
*
* Starting with simple Coul-LJ
* */
//Constant dielectric, LJ (normal forces)
kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
float4 invr, invrsig2, invrsig6;
float4 f;
invr = rsqrt( r2 ); // 1 or 2 flops?
invrsig2 = invr * sig; //1 flop
invrsig2 = invrsig2 * invrsig2; //1 flop
invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops
f += epsfac*qq*invr; //2 flops
f *= invr*invr; //2 flops
return f; //Total ? flops
}
kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) {
float4 r2;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops
return r2;
}
kernel void knbforce_nl(
float AtomStrWidth, //Width of atom position stream
float epsfac,
iter float2 wpos<>, //pixel position of output (i-atom)
float4 posq[][], //coordinates and charges
float3 inforce<>,
float4 nlist0<>, //nbor indices
float4 sig0<>, //LJ sigmas
float4 eps0<>, //LJ epsilons
//We'll add more nlists here to reduce number of kernel calls
out float3 force<> //output forces
)
{
float2 jind;
float4 ind_tmp1, ind_tmp2;
float4 qj;
float3 jpos1, jpos2, jpos3, jpos4;
float4 r2, fs, qq;
float3 d1, d2, d3, d4;
float3 ipos;
float qi;
//Since neighbors will not be adjacent in memory
//We may want to stagger compute and etch, but
//with fxc, we don't have control over scheduling anyway.
//Will try hand scheduling the ps3 code later
//etch i parameters
ipos = posq[ wpos ].xyz;
qi = posq[ wpos ].w;
//etch j parameters
ind_tmp1 = floor( nlist0 / AtomStrWidth );
ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
force = inforce;
//1st set of 4 neighbors
jind.y = ind_tmp1.x;
jind.x = ind_tmp2.x;
jpos1 = posq[ jind ].xyz;
qj.x = posq[ jind ].w;
jind.y = ind_tmp1.y;
jind.x = ind_tmp2.y;
jpos2 = posq[ jind ].xyz;
qj.y = posq[ jind ].w;
jind.y = ind_tmp1.z;
jind.x = ind_tmp2.z;
jpos3 = posq[ jind ].xyz;
qj.z = posq[ jind ].w;
jind.y = ind_tmp1.w;
jind.x = ind_tmp2.w;
jpos4 = posq[ jind ].xyz;
qj.w = posq[ jind ].w;
d1 = ipos - jpos1;
d2 = ipos - jpos2;
d3 = ipos - jpos3;
d4 = ipos - jpos4;
r2 = nl_get_r2( d1, d2, d3, d4 );
qq = qi * qj;
fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 );
force += fs.x * d1;
force += fs.y * d2;
force += fs.z * d3;
force += fs.w * d4;
}
platforms/brook/src/gpu/knlist.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Order N^2 neighbor searching.
*
* This only works for force fields that don't have charge groups.
* If you insist on charge groups, you'll have to pass in appropriate masks here.
*
* This is a simplified kernel, for testing the O(N) speeds.
*
* This does a complete N^2 search without considering groups of
* atoms. Most likely this will prove to be inefficient for
* the O(N) kernel. Lets find out.
*
*
* Each component of the curpass textures is an atom index. The w component
* of curpass3 is a count indicating how many j particles we have
* scanned for this particular i atom.
*
* */
kernel void knborsearch(
float first, //Positive means constructing the first 16.
iter float2 wpos<>, //pixel position of output
float AtomStrHeight,
float AtomStrWidth,
float cutoff2, //square of the cutoff
float natoms, //number of atoms
float excl[][], //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
float4 posq[][], //atom positions/charges
float4 prevpass3<>, //Last output texture of previous pass
out float4 curpass0<>, //First output of current pass
out float4 curpass1<>,
out float4 curpass2<>,
out float4 curpass3<> //Last output of current pass, used in next pass
){
/*For this kernel, wpos == iatom*/
float2 iind;
float2 jind;
float3 ipos, jpos, dr;
float r2;
float listptr; //Where in the 16-chunk are we now.
float jlinind;
float breakflag; //positive means keep looping, negative means stop
float4 exclconst;
float2 exclind;
float exclusions;
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
iind = wpos;
exclind.x = iind.x + iind.y * AtomStrWidth;
//etch i atom
ipos = posq[ iind ].xyz;
//Loop over j depending on prevpass
jlinind = prevpass3.w + 1;
jind.y = floor( jlinind / AtomStrWidth );
jind.x = fmod( jlinind, AtomStrWidth );
exclind.y = jlinind;
//All outputs should be initialized to
listptr = 0.0f;
breakflag = 1.0f;
//if we already finished, do nothing
if ( first < 0.0f && prevpass3.w < 0.0f )
breakflag = -1.0f;
//set to -1 to indicate no neighbor
//just to save a separate set of init calls
curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
curpass1 = curpass0;
curpass2 = curpass0;
curpass3 = curpass0;
while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
//First see if this pair is excluded
exclusions = excl[ exclind ];
if ( exclusions < 0.5f ) {
jpos = posq[ jind ].xyz;
dr = jpos - ipos;
r2 = dot( dr, dr );
//If it is inside the cutoff
if ( r2 < cutoff2 ) {
//Figure out where to put it
//We are allowed 4 nested conditionals
//We can play with the structuring of these
if ( listptr < 0.5f )
curpass0.x = jlinind;
else if ( listptr < 1.5f )
curpass0.y = jlinind;
else if ( listptr < 2.5f )
curpass0.z = jlinind;
else if ( listptr < 3.5f )
curpass0.w = jlinind;
else if ( listptr < 4.5f )
curpass1.x = jlinind;
else if ( listptr < 5.5f )
curpass1.y = jlinind;
else if ( listptr < 6.5f )
curpass1.z = jlinind;
else if ( listptr < 7.5f )
curpass1.w = jlinind;
else if ( listptr < 8.5f )
curpass2.x = jlinind;
else if ( listptr < 9.5f )
curpass2.y = jlinind;
else if ( listptr < 10.5f )
curpass2.z = jlinind;
else if ( listptr < 11.5f )
curpass2.w = jlinind;
else if ( listptr < 12.5f )
curpass3.x = jlinind;
else if ( listptr < 13.5f )
curpass3.y = jlinind;
else if ( listptr < 14.5f ) {
curpass3.z = jlinind;
}
else if ( listptr < 15.5f ) {
//We're done for this pass
curpass3.w = jlinind;
breakflag = -1.0f;
}
listptr += 1.0f;
}
}
jlinind += 1.0f;
exclind.y += 1.0f;
jind.x += 1.0f;
}
jind.x = 0.0f;
jind.y += 1.0f;
}
}
//Precomputes lennard jones sig and eps
//to save an indirect etch (and a ew flops) in the
//force kernel. The charge product is not done this way
//because charges have to be etched anyway with the
//positions
kernel void knl_precompute_sigeps(
float AtomStrWidth,
iter float2 wpos<>,
float2 sigeps[][], //x=sigma, y=epsilon
float4 nlist0<>,
float4 nlist1<>,
out float4 sig0<>,
out float4 eps0<>,
out float4 sig1<>,
out float4 eps1<>
)
{
float2 jind;
float4 ind_tmp1, ind_tmp2;
float2 isigeps, jsigeps1, jsigeps2, jsigeps3, jsigeps4;
isigeps = sigeps[ wpos ];
ind_tmp1 = floor( nlist0 / AtomStrWidth );
ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;
jind.y = ind_tmp1.x;
jind.x = ind_tmp2.x;
jsigeps1 = sigeps[ jind ];
jind.y = ind_tmp1.y;
jind.x = ind_tmp2.y;
jsigeps2 = sigeps[ jind ];
jind.y = ind_tmp1.z;
jind.x = ind_tmp2.z;
jsigeps3 = sigeps[ jind ];
jind.y = ind_tmp1.w;
jind.x = ind_tmp2.w;
jsigeps4 = sigeps[ jind ];
sig0.x = isigeps.x + jsigeps1.x;
sig0.y = isigeps.x + jsigeps2.x;
sig0.z = isigeps.x + jsigeps3.x;
sig0.w = isigeps.x + jsigeps4.x;
eps0.x = isigeps.y * jsigeps1.y;
eps0.y = isigeps.y * jsigeps2.y;
eps0.z = isigeps.y * jsigeps3.y;
eps0.w = isigeps.y * jsigeps4.y;
//2nd nlist set
ind_tmp1 = floor( nlist1 / AtomStrWidth );
ind_tmp2 = nlist1 - ind_tmp1 * AtomStrWidth;
jind.y = ind_tmp1.x;
jind.x = ind_tmp2.x;
jsigeps1 = sigeps[ jind ];
jind.y = ind_tmp1.y;
jind.x = ind_tmp2.y;
jsigeps2 = sigeps[ jind ];
jind.y = ind_tmp1.z;
jind.x = ind_tmp2.z;
jsigeps3 = sigeps[ jind ];
jind.y = ind_tmp1.w;
jind.x = ind_tmp2.w;
jsigeps4 = sigeps[ jind ];
sig1.x = isigeps.x + jsigeps1.x;
sig1.y = isigeps.x + jsigeps2.x;
sig1.z = isigeps.x + jsigeps3.x;
sig1.w = isigeps.x + jsigeps4.x;
eps1.x = isigeps.y * jsigeps1.y;
eps1.y = isigeps.y * jsigeps2.y;
eps1.z = isigeps.y * jsigeps3.y;
eps1.w = isigeps.y * jsigeps4.y;
}
platforms/brook/src/gpu/kpdihs.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
//for each dihedral.
kernel void kpdih(
float xstrwidth, //stream width for x
float4 xq[][], //particle coordinates and charges
float4 atoms<>, //ijkl quartets
float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
out float3 fi<>, //output forces for i, j, k, l
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
) {
float3 r_ij, r_kj, r_kl;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float phi, ddphi, mdphi;
float3 u, v, s;
float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
//Convert from linear indices to 2D indices into x
//If this kernel is compute bound, we can do this
//conversion before-hand and feed in the 2D coordinates
ai.y = floor( atoms.x / xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
aj.y = floor( atoms.y / xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
ak.y = floor( atoms.z / xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
al.y = floor( atoms.w / xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
r_ij = xq[ai].xyz - xq[aj].xyz; //3
r_kj = xq[ak].xyz - xq[aj].xyz; //3
r_kl = xq[ak].xyz - xq[al].xyz; //3
m = cross( r_ij, r_kj ); //9
n = cross( r_kj, r_kl ); //9
msq = dot(m, m); //5
nsq = dot(n, n); //5
cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0, 1.0 ); //8
sgnphi = sign( dot( r_ij, n ) ); //5
phi = sgnphi * acos( cos_phi ); //2
mdphi = parms.z * phi - parms.y; //2
ddphi = - parms.x * parms.z * sin( mdphi ); //3
nrkj2 = dot( r_kj, r_kj ); //5
nrkj = sqrt( nrkj2 ); //1
fi = -ddphi * nrkj / msq * m; //5
fl = ddphi * nrkj / nsq * n; //5
u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
s = u - v; //3
fj = s - fi; //3
fk = -(s + fl); //3
//Total : 100 flops
}
platforms/brook/src/gpu/krbdihs.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
//for each dihedral.
kernel void krbdih(
float xstrwidth, //stream width for x
float4 xq[][], //particle coordinates and charges
float4 atoms<>, //ijkl quartets
float4 parm03<>, //params 0-3
float2 parm45<>, //params 4 and 5
out float3 fi<>, //output forces for i, j, k, l
out float3 fj<>,
out float3 fk<>,
out float3 fl<>
) {
float3 r_ij, r_kj, r_kl;
float2 ai, aj, ak, al;
float3 m, n;
float sgnphi;
float cosfac;
float ddphi;
float3 u, v, s;
float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;
//Convert from linear indices to 2D indices into x
//If this kernel is compute bound, we can do this
//conversion before-hand and feed in the 2D coordinates
ai.y = floor( atoms.x / xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
aj.y = floor( atoms.y / xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
ak.y = floor( atoms.z / xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
al.y = floor( atoms.w / xstrwidth );
al.x = atoms.w - al.y * xstrwidth;
r_ij = xq[ai].xyz - xq[aj].xyz; //3
r_kj = xq[ak].xyz - xq[aj].xyz; //3
r_kl = xq[ak].xyz - xq[al].xyz; //3
m = cross( r_ij, r_kj ); //9
n = cross( r_kj, r_kl ); //9
msq = dot(m, m); //5
nsq = dot(n, n); //5
cos_phi = dot(m, n)/sqrt(msq*nsq); //8 (sqrt=1)
//Switching to "polymer convention"
//See gromacs code
cos_phi = -cos_phi;
sgnphi = sign( dot(r_ij, n) ); //5
sin_phi = -sgnphi*sqrt( clamp( 1.0 - cos_phi * cos_phi, 0.0, 1.0) ); //3
//ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1}
//This might not be the best way to use the
//4-way mads, but for now we'll let fxc figure it
//out.
//If we precompute some ratios of the parameters
//we can use the 4-way mads better
ddphi = 5.0 * parm45.y;
ddphi = 4.0 * parm45.x + ddphi * cos_phi;
ddphi = 3.0 * parm03.w + ddphi * cos_phi;
ddphi = 2.0 * parm03.z + ddphi * cos_phi;
ddphi = parm03.y + ddphi * cos_phi;
ddphi = -ddphi * sin_phi; //13 flops total for ddphi
nrkj2 = dot( r_kj, r_kj ); //5
nrkj = sqrt( nrkj2 ); //1
fi = -ddphi * nrkj / msq * m; //5
fl = ddphi * nrkj / nsq * n; //5
u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
s = u - v; //3
fj = s - fi; //3
fk = -(s + fl); //3
//Total flops: 109 per rb torsion.
}
platforms/brook/src/gpu/kshakeh.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/*
* This kernel constrains bonds involving hydrogen using an unrolled
* shake implementation. We assume that a heavy atom can have at most
* 3 hydrogens attached. The hydrogens are expected to be identical
*
* For atoms that have ewer than 3 hydrogens
*
* Also we don't check the constraints!
*
* We do a fixed number of iterations without checking for convergence
* to avoid stalling
* *************************************************************/
/*
kernel void
kshakeh(
float nit, //number of iterations
float strwidth, //stream width of posq
float invmH, //inverse mass of hydrogen
float omega, //parameter from gromacs, possibly unused
float4 atoms<>, //heavy0, h1, h2, h3
float4 posq[][], //positions before update
float4 posqp[][], //positions after update
float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
out float4 cposq0<>, //constrained position for heavy atom
out float4 cposq1<>, //ditto for h1
out float4 cposq2<>, //ditto for h2
out float4 cposq3<> //ditto for h3
) {
float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
float i; //iteration count
float3 xi, xj1, xj2, xj3; //coordinates
float3 xpi, xpj1, xpj2, xpj3; //coordinates
float3 rij1, rij2, rij3, rpij, dr;
float rpsqij, rrpr, acor;
float mask2, mask3;
float diff;
ai.y = floor( atoms.x / strwidth );
ai.x = atoms.x - ai.y * strwidth;
aj1.y = floor( atoms.y / strwidth );
aj1.x = atoms.y - aj1.y * strwidth;
//If further hydrogens are absent,
//just set to the coordinates of the first
//so we don't have any junk memory accesses
//or nans/infs in the calcs.
if ( atoms.z > -0.5f ) {
aj2.y = floor( atoms.z / strwidth );
aj2.x = atoms.z - aj2.y * strwidth;
mask2 = 1.0f;
}
else {
aj2 = aj1;
mask2 = 0.0f;
}
if ( atoms.w > -0.5f ) {
aj3.y = floor( atoms.w / strwidth );
aj3.x = atoms.w - aj3.y * strwidth;
mask3 = 1.0f;
}
else {
aj3 = aj1;
mask3 = 0.0f;
}
cposq0 = posq[ai];
cposq1 = posq[aj1];
cposq2 = posq[aj2];
cposq3 = posq[aj3];
xi = cposq0.xyz;
xj1 = cposq1.xyz;
xj2 = cposq2.xyz;
xj3 = cposq3.xyz;
rij1 = xi - xj1;
rij2 = xi - xj2;
rij3 = xi - xj3;
xpi = posqp[ai].xyz;
xpj1 = posqp[aj1].xyz;
xpj2 = posqp[aj2].xyz;
xpj3 = posqp[aj3].xyz;
i = 0.0f;
while ( i < 15.0f ) {
//First hydrogen
rpij = xpi - xpj1;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij1, rpij );
//for debugging only
diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
if ( diff < 1.0f )
acor = 0.0f;
else
//params.y = mu/2, params.z = blen*blen
acor = omega * ( params.z - rpsqij ) * params.y / rrpr ;
dr = rij1 * acor;
xpi += dr * params.x;
xpj1 -= dr * invmH;
//Second hydrogen
rpij = xpi - xpj2;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij2, rpij );
//for debugging only
diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
if ( diff < 1.0f )
acor = 0.0f;
else
//params.y = mu/2, params.z = blen*blen
acor = mask2 * omega * ( params.z - rpsqij ) * params.y / rrpr ;
dr = rij2 * acor;
xpi += dr * params.x;
xpj2 -= dr * invmH;
//Third hydrogen
rpij = xpi - xpj3;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij3, rpij );
//for debugging only
diff = abs( params.z - rpsqij ) / (params.z * 2 * 1e-4 );
if ( diff < 1.0f )
acor = 0.0f;
else
//params.y = mu/2, params.z = blen*blen
acor = mask3 * omega * ( params.z - rpsqij ) * params.y / rrpr ;
dr = rij3 * acor;
xpi += dr * params.x;
xpj3 -= dr * invmH;
i += 1.0f;
}
cposq0.xyz = xpi;
cposq1.xyz = xpj1;
cposq2.xyz = xpj2;
cposq3.xyz = xpj3;
} */
/*Applies the updated positions
*Each atom occurs at one and only one position
* */
/*
kernel void kshakeh_update(
float strwidth, //width of cposq streams
float2 invmap<>, //shakeh inverse map
float4 posq<>, //untouched positions
float4 cposq0[][], //constrained position for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated positions
){
float2 atom;
atom.y = floor( invmap.x / strwidth );
atom.x = invmap.x - atom.y * strwidth;
if ( invmap.y < 0 )
oposq = posq;
else if ( invmap.y < 0.5f )
oposq = cposq0[ atom ];
else if ( invmap.y < 1.5f )
oposq = cposq1[ atom ];
else if ( invmap.y < 2.5f )
oposq = cposq2[ atom ];
else if ( invmap.y < 3.5f )
oposq = cposq3[ atom ];
} */
/* Fix for precision of terms first order in dt
* To be used with corresponding update kernels
* The input posqp is now changes to posq rather than
* posq+changes
* */
kernel void
kshakeh_fix1(
float nit, //number of iterations
float strwidth, //stream width of posq
float invmH, //inverse mass of hydrogen
float omega, //parameter from gromacs, possibly unused
float4 atoms<>, //heavy0, h1, h2, h3
float4 posq[][], //positions before update
float4 posqp[][], //changes to positions
float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
out float4 cposq0<>, //constrained position for heavy atom
out float4 cposq1<>, //ditto for h1
out float4 cposq2<>, //ditto for h2
out float4 cposq3<> //ditto for h3
) {
float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
float i; //iteration count
float3 xi, xj1, xj2, xj3; //coordinates
float3 xpi, xpj1, xpj2, xpj3; //coordinates
float3 rij1, rij2, rij3, rpij, dr;
float rpsqij, rrpr, acor;
float mask2, mask3;
float diff;
float rij1sq, rij2sq, rij3sq;
float ld1, ld2, ld3;
// ai.y = floor( atoms.x / strwidth );
ai.y = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
ai.x = atoms.x - ai.y * strwidth;
// aj1.y = floor( atoms.y / strwidth );
aj1.y = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
aj1.x = atoms.y - aj1.y * strwidth;
//If further hydrogens are absent,
//just set to the coordinates of the first
//so we don't have any junk memory accesses
//or nans/infs in the calcs.
if ( atoms.z > -0.5f ) {
// aj2.y = floor( atoms.z / strwidth );
aj2.y = round( (atoms.z - fmod( atoms.z, strwidth ))/strwidth );
aj2.x = atoms.z - aj2.y * strwidth;
mask2 = 1.0f;
}
else {
aj2 = aj1;
mask2 = 0.0f;
}
if ( atoms.w > -0.5f ) {
// aj3.y = floor( atoms.w / strwidth );
aj3.y = round( (atoms.w - fmod( atoms.w, strwidth ))/strwidth );
aj3.x = atoms.w - aj3.y * strwidth;
mask3 = 1.0f;
}
else {
aj3 = aj1;
mask3 = 0.0f;
}
cposq0 = posq[ai];
cposq1 = posq[aj1];
cposq2 = posq[aj2];
cposq3 = posq[aj3];
xi = cposq0.xyz;
xj1 = cposq1.xyz;
xj2 = cposq2.xyz;
xj3 = cposq3.xyz;
rij1 = xi - xj1;
rij2 = xi - xj2;
rij3 = xi - xj3;
rij1sq = dot( rij1, rij1 );
rij2sq = dot( rij2, rij2 );
rij3sq = dot( rij3, rij3 );
ld1 = params.z - rij1sq;
ld2 = params.z - rij2sq;
ld3 = params.z - rij3sq;
/*
xpi = posqp[ai].xyz;
xpj1 = posqp[aj1].xyz;
xpj2 = posqp[aj2].xyz;
xpj3 = posqp[aj3].xyz;
*/
xpi = posqp[ai].xyz - xi;
xpj1 = posqp[aj1].xyz - xj1;
xpj2 = posqp[aj2].xyz - xj2;
xpj3 = posqp[aj3].xyz - xj3;
// XXX
// cposq0 = posqp[ai];
// cposq1 = posqp[aj1];
// cposq2 = posqp[aj2];
// cposq3 = posqp[aj3];
// OK
// XXX
i = 0.0f;
while ( i < 15.0f ) {
//First hydrogen
rpij = xpi - xpj1; //This is really rpij - rij
rpsqij = dot( rpij, rpij ); //This is really deltar ^ 2
rrpr = dot( rij1, rpij ); //This is r.deltar
//for debugging only
//params.y = mu/2, params.z = blen*blen
diff = abs( ld1 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = omega * ( ld1 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij1sq ) ;
dr = rij1 * acor;
xpi += dr * params.x;
xpj1 -= dr * invmH;
//Second hydrogen
rpij = xpi - xpj2;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij2, rpij );
//for debugging only
diff = abs( ld2 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask2 * omega * ( ld2 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij2sq ) ;
dr = rij2 * acor;
xpi += dr * params.x;
xpj2 -= dr * invmH;
//Third hydrogen
rpij = xpi - xpj3;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij3, rpij );
diff = abs( ld3 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
// for debugging only
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask3 * omega * ( ld3 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij3sq ) ;
dr = rij3 * acor;
xpi += dr * params.x;
xpj3 -= dr * invmH;
i += 1.0f;
}
//Output modified delta's
cposq0.xyz = xpi;
cposq1.xyz = xpj1;
cposq2.xyz = xpj2;
cposq3.xyz = xpj3;
}
kernel void
kshakeh_fix2(
float nit, //number of iterations
float strwidth, //stream width of posq
float invmH, //inverse mass of hydrogen
float omega, //parameter from gromacs, possibly unused
float4 atoms<>, //heavy0, h1, h2, h3
float4 posq[][], //positions before update
float4 posqp[][], //changes to positions
float4 params<>, // (1/m0, mu/2, blensq, 0.0f )
out float4 cposq0<>, //constrained position for heavy atom
out float4 cposq1<>, //ditto for h1
out float4 cposq2<>, //ditto for h2
out float4 cposq3<> //ditto for h3
) {
float2 ai, aj1, aj2, aj3; //2d indices, can be precalc.
float i; //iteration count
float3 xi, xj1, xj2, xj3; //coordinates
float3 xpi, xpj1, xpj2, xpj3; //coordinates
float3 rij1, rij2, rij3, rpij, dr;
float rpsqij, rrpr, acor;
float mask2, mask3;
float diff;
float rij1sq, rij2sq, rij3sq;
float ld1, ld2, ld3;
// ai.y = floor( atoms.x / strwidth );
ai.y = round( (atoms.x - fmod( atoms.x, strwidth ))/strwidth );
ai.x = atoms.x - ai.y * strwidth;
// aj1.y = floor( atoms.y / strwidth );
aj1.y = round( (atoms.y - fmod( atoms.y, strwidth ))/strwidth );
aj1.x = atoms.y - aj1.y * strwidth;
//If further hydrogens are absent,
//just set to the coordinates of the first
//so we don't have any junk memory accesses
//or nans/infs in the calcs.
if ( atoms.z > -0.5f ) {
// aj2.y = floor( atoms.z / strwidth );
aj2.y = round( (atoms.z - fmod( atoms.z, strwidth ))/strwidth );
aj2.x = atoms.z - aj2.y * strwidth;
mask2 = 1.0f;
}
else {
aj2 = aj1;
mask2 = 0.0f;
}
if ( atoms.w > -0.5f ) {
// aj3.y = floor( atoms.w / strwidth );
aj3.y = round( (atoms.w - fmod( atoms.w, strwidth ))/strwidth );
aj3.x = atoms.w - aj3.y * strwidth;
mask3 = 1.0f;
}
else {
aj3 = aj1;
mask3 = 0.0f;
}
cposq0 = posq[ai];
cposq1 = posq[aj1];
cposq2 = posq[aj2];
cposq3 = posq[aj3];
xi = cposq0.xyz;
xj1 = cposq1.xyz;
xj2 = cposq2.xyz;
xj3 = cposq3.xyz;
rij1 = xi - xj1;
rij2 = xi - xj2;
rij3 = xi - xj3;
rij1sq = dot( rij1, rij1 );
rij2sq = dot( rij2, rij2 );
rij3sq = dot( rij3, rij3 );
ld1 = params.z - rij1sq;
ld2 = params.z - rij2sq;
ld3 = params.z - rij3sq;
xpi = posqp[ai].xyz;
xpj1 = posqp[aj1].xyz;
xpj2 = posqp[aj2].xyz;
xpj3 = posqp[aj3].xyz;
/*
xpi = posqp[ai].xyz - xi;
xpj1 = posqp[aj1].xyz - xj1;
xpj2 = posqp[aj2].xyz - xj2;
xpj3 = posqp[aj3].xyz - xj3;
*/
// XXX
// cposq0 = posqp[ai];
// cposq1 = posqp[aj1];
// cposq2 = posqp[aj2];
// cposq3 = posqp[aj3];
// OK
// XXX
i = 0.0f;
while ( i < 15.0f ) {
//First hydrogen
rpij = xpi - xpj1; //This is really rpij - rij
rpsqij = dot( rpij, rpij ); //This is really deltar ^ 2
rrpr = dot( rij1, rpij ); //This is r.deltar
//for debugging only
//params.y = mu/2, params.z = blen*blen
diff = abs( ld1 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = omega * ( ld1 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij1sq ) ;
dr = rij1 * acor;
xpi += dr * params.x;
xpj1 -= dr * invmH;
//Second hydrogen
rpij = xpi - xpj2;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij2, rpij );
//for debugging only
diff = abs( ld2 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask2 * omega * ( ld2 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij2sq ) ;
dr = rij2 * acor;
xpi += dr * params.x;
xpj2 -= dr * invmH;
//Third hydrogen
rpij = xpi - xpj3;
rpsqij = dot( rpij, rpij );
rrpr = dot( rij3, rpij );
diff = abs( ld3 - 2 * rrpr - rpsqij ) / (params.z * 2 * 1e-4f );
// for debugging only
if ( diff < 1.0f )
acor = 0.0f;
else
acor = mask3 * omega * ( ld3 - 2 * rrpr - rpsqij ) * params.y / ( rrpr + rij3sq ) ;
dr = rij3 * acor;
xpi += dr * params.x;
xpj3 -= dr * invmH;
i += 1.0f;
}
//Output modified delta's
cposq0.xyz = xpi;
cposq1.xyz = xpj1;
cposq2.xyz = xpj2;
cposq3.xyz = xpj3;
}
/*Applies the updated delta's
*Each atom occurs at one and only one position
* */
kernel void kshakeh_update1_fix1Old(
float strwidth, //width of cposq streams
float2 invmap<>, //shakeh inverse map
float4 posq<>, //constrained deltas
float4 cposq0[][], //constrained delta for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated deltas
){
float2 atom;
//atom.y = floor( invmap.x / strwidth );
atom.y = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
atom.x = invmap.x - atom.y * strwidth;
if ( invmap.y < 0 )
oposq = posq;
else if ( invmap.y < 0.5f )
oposq = cposq0[ atom ];
else if ( invmap.y < 1.5f )
oposq = cposq1[ atom ];
else if ( invmap.y < 2.5f )
oposq = cposq2[ atom ];
else if ( invmap.y < 3.5f )
oposq = cposq3[ atom ];
}
kernel void kshakeh_update2_fix1(
float strwidth, //width of cposq streams
float2 invmap<>, //shakeh inverse map
float4 posq<>, //old positions
float4 posqp<>, //deltas from sd2
float4 cposq0[][], //constrained delta for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated deltas
){
float2 atom;
// atom.y = floor( invmap.x / strwidth );
atom.y = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
atom.x = invmap.x - atom.y * strwidth;
if ( invmap.y < 0 ){
oposq.w = posq.w;
oposq.xyz = posqp.xyz - posq.xyz;
} else if ( invmap.y < 0.5f )
oposq = cposq0[ atom ];
else if ( invmap.y < 1.5f)
oposq = cposq1[ atom ];
else if ( invmap.y < 2.5f )
oposq = cposq2[ atom ];
else if ( invmap.y < 3.5f )
oposq = cposq3[ atom ];
oposq.xyz += posq.xyz;
}
kernel void kshakeh_update1_fix1(
float strwidth, //width of cposq streams
float sdFactor,
float2 invmap<>, //shakeh inverse map
float4 posq<>, //old positions
float4 posqp<>, //deltas from sd2
float3 vp<>, //deltas from sd2
float4 cposq0[][], //constrained delta for heavy atom
float4 cposq1[][], //ditto for h1
float4 cposq2[][], //ditto for h2
float4 cposq3[][], //ditto for h3
out float4 oposq<> //updated deltas
){
float2 atom;
// atom.y = floor( invmap.x / strwidth );
atom.y = round( (invmap.x - fmod( invmap.x, strwidth ))/strwidth );
atom.x = invmap.x - atom.y * strwidth;
oposq = posq;
if ( invmap.y < 0 ){
oposq = posqp;
} else if ( invmap.y < 0.5f )
oposq.xyz += cposq0[ atom ].xyz;
else if ( invmap.y < 1.5f )
oposq.xyz += cposq1[ atom ].xyz;
else if ( invmap.y < 2.5f )
oposq.xyz += cposq2[ atom ].xyz;
else if ( invmap.y < 3.5f )
oposq.xyz += cposq3[ atom ].xyz;
}
platforms/brook/src/gpu/kupdatemd.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void kupdate_md1_berendsen(
float dt,
float3 lg, //Berendsen coupling, assuming only one group
float3 uold, //Mean velocity from previous step
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vb;
float3 one;
posqp = posq;
one = float3( 1.0f, 1.0f, 1.0f );
vb = ( one - lg ) * uold;
vnew = lg* ( v + f * invmass * dt ) + vb;
posqp.xyz += vnew * dt;
}
//Nose-Hoover / Parinello Rahman
kernel void kupdate_md1_extended (
float dt,
float3 lg,
float xi,
float3 M0,
float3 M1,
float3 M2,
float3 uold,
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vrel, vnrel;
float3 vtrans;
vrel = v - uold;
vtrans = float3( dot(M0, vrel), dot(M1, vrel), dot(M2, vrel) );
vrel += dt * ( invmass * f - xi * vrel - vtrans );
vnew = uold + lg * vrel;
posqp = posq;
posqp.xyz += vnew * dt;
}
kernel void kupdate_md2(
float dtinv, //1/dt
float4 posqp<>, //positions after constraints
float4 posq<>, //positions before update
out float3 vnew<>, //Corrected velocities
out float4 posqnew<> //equal to posqp, avoids an extra call to copy
)
{
posqnew = posqp;
vnew = ( posqp - posq ) * dtinv;
}
/* Version that handles terms of order dt more carefully
* Update1 computes deltax's rather than x + deltax
* Corresponding shake implementation modifies the delta's
* Update2 adds the constrained deltas to x.
* */
kernel void kupdate_md1_berendsen_fix1(
float dt,
float3 lg, //Berendsen coupling, assuming only one group
float3 uold, //Mean velocity from previous step
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vb;
float3 one;
one = float3( 1.0f, 1.0f, 1.0f );
vb = ( one - lg ) * uold;
vnew = lg* ( v + f * invmass * dt ) + vb;
posqp.xyz = vnew * dt;
posqp.w = posq.w;
}
//Nose-Hoover / Parinello Rahman
kernel void kupdate_md1_extended_fix1 (
float dt,
float3 lg,
float xi,
float3 M0,
float3 M1,
float3 M2,
float3 uold,
float4 posq<>,
float3 v<>,
float3 f<>,
float invmass<>,
out float3 vnew<>,
out float4 posqp<> )
{
float3 vrel, vnrel;
float3 vtrans;
vrel = v - uold;
vtrans = float3( dot(M0, vrel), dot(M1, vrel), dot(M2, vrel) );
vrel += dt * ( invmass * f - xi * vrel - vtrans );
vnew = uold + lg * vrel;
posqp.w = posq.w;
posqp.xyz = vnew * dt;
}
kernel void kupdate_md2_fix1(
float dtinv, //1/dt
float4 posqp<>, //positions after constraints
float4 posq<>, //positions before update
out float3 vnew<>, //Corrected velocities
out float4 posqnew<> //equal to posqp, avoids an extra call to copy
)
{
posqnew.xyz = posq.xyz + posqp.xyz;
posqnew.w = posq.w;
vnew = posqp.xyz * dtinv;
}
platforms/brook/src/gpu/kupdatesd.br
0 → 100644
View file @
b53cd947
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Stochastic Integrator
*
* Since we don't have random numbers on the GPU, we
* precalculate random numbers from a normal distribution
* and store them in several large textures. Periodically
* these will have to be refreshed on the CPU.
*
* Doing gathers instead of streams for the random vectors,
* but the gathers should be fully coherent anyway.
* This is just to avoid some
* painful manipulations with multiple streams and domains.
* */
/* First part before first constraint call
*
*
* Many things can be precalculated here
* If we become compute bound (wonder how), we can precalc stuff
*
* Assuming constant temperature. If you want to change the
* temperature, you have to change all the precalculated
* constants that use it.
* */
/*
kernel void kupdate_sd1(
float xstrwidth, //atom stream width
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
//sd constants, prefixed with c
//to avoid 1 letter names, cxx = sdc[0].xx
float cem,
//sd precalculated constants
float pc1, //tau_t[0] * ( 1 - sdc[0].em )
float pc2, //tau_t[gt] * (sdc[gt].eph - sdc.emh)
float pc3, //sdc[0].d/(tau_t * sdc[0].c);
//sdpc = sd precalculated per atom values
//sdpc.x = 1/sqrt(m)*sig[0].Yv
//sdpc.y = 1/sqrt(m)*sig[0].V
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd2X<>, //This is calculated in sd2 of the previous step
float4 posq<>,
float3 f<>,
float3 v<>,
float invmass<>,
out float3 sd1V<>, //This is V in gromacs, reused in sd2
out float3 vnew<>,
out float4 posqp<>
){
float3 Vmh;
float3 fg1, fg2;
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
//2 random vectors used in each fragment
linind_gauss = 2 * linind_gauss + goffset;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
Vmh = sd2X * pc3 + sdpc.x * fg1;
sd1V = sdpc.y * fg2;
vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
posqp.w = posq.w;
posqp.xyz = posq.xyz + vnew * pc2;
} */
/*Second part of sd update, after first constraints call*/
/*
kernel void kupdate_sd2(
float xstrwidth, //stream width of atoms
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
float pc1, //this is 1/pc2 for sd1
//= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
float pc2, //= tau_t * sdc[0].d/(sdc[0].em - 1)
//per atom precalcs
//sdpc.x = 1/sqrt(m)*sig[0].Yx
//sdpc.y = 1/sqrt(m)*sig[0].X
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd1V<>, //calculated in sd1
float4 posq<>, //positions pre-update
float4 posqp<>, //positions post-constraint
float3 vnew<>, //velocities after sd1
out float3 sd2X<>, //Used in sd1
out float3 v<>, //velocities corrected for constraints
out float4 posqp2<> //Will need to be constrained again
){
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float3 fg1, fg2;
float3 Xmh;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
linind_gauss = 2 * linind_gauss + goffset;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
igauss.y = floor( linind_gauss / gstrwidth );
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
v = pc1 * ( posqp.xyz - posq.xyz );
Xmh = sd1V * pc2 + sdpc.x * fg1;
sd2X = sdpc.y * fg2;
posqp2 = posqp;
posqp2.xyz += sd2X - Xmh;
} */
//Applies a permutation to the given random vector stream
//Totally bandwidth bound and streams are large, should be done infrequently.
//Probably don't want gvin and gvout to be the same stream!
kernel void kpermute_vectors( float gstrwidth, float perm<>,
float3 gvin[][], out float3 gvout<> )
{
float2 ind;
//ind.y = floor( perm / gstrwidth );
ind.y = round( (perm - fmod( perm, gstrwidth ))/gstrwidth );
ind.x = perm - ind.y * gstrwidth;
gvout = gvin[ ind ];
}
/*
* Alternative formulation to handle precision better
* In updatesd1, we output only deltaV's. The shake
* kernel uses the deltaV's and the X's to output
* a bunch of constrained deltax's
* */
kernel void kupdate_sd1_fix1(
float xstrwidth, //atom stream width
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
//sd constants, prefixed with c
//to avoid 1 letter names, cxx = sdc[0].xx
float cem,
//sd precalculated constants
float pc1, //tau_t[0] * ( 1 - sdc[0].em )
float pc2, //tau_t[gt] * (sdc[gt].eph - sdc.emh)
float pc3, //sdc[0].d/(tau_t * sdc[0].c);
//sdpc = sd precalculated per atom values
//sdpc.x = 1/sqrt(m)*sig[0].Yv
//sdpc.y = 1/sqrt(m)*sig[0].V
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd2X<>, //This is calculated in sd2 of the previous step
float4 posq<>,
float3 f<>,
float3 v<>,
float invmass<>,
out float3 sd1V<>, //This is V in gromacs, reused in sd2
out float3 vnew<>,
out float4 posqp<>
){
float3 Vmh;
float3 fg1, fg2;
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
//2 random vectors used in each fragment
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
/*
Gromacs code
n = 1 -> ngtc
sdc[n].gdt = dt/tau_t[n];
sdc[n].eph = exp(sdc[n].gdt/2);
sdc[n].emh = exp(-sdc[n].gdt/2);
sdc[n].em = exp(-sdc[n].gdt);
sdc[n].b = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);
// 2.15 in paper (C)
sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
Vmh = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);
V[n-start][d] = ism*sig[gt].V*fgauss(&jran);
v[n][d] = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
V[n-start][d] - sdc[gt].em*Vmh;
xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
*/
// ---------------------------------------------------------------------------------------------
// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
//sdpc.x = 1/sqrt(m)*sig[0].Yv
Vmh = sd2X * pc3 + sdpc.x * fg1;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//sdpc.y = 1/sqrt(m)*sig[0].V
sd1V = sdpc.y * fg2;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//
// float pc1 = tau_t[0] * ( 1 - sdc[0].em )
vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
// float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
posqp = posq;
posqp.xyz += vnew * pc2;
}
kernel void kupdate_sd2_fix1(
float xstrwidth, //stream width of atoms
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
float pc1, //this is 1/pc2 for sd1
//= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
float pc2, //= tau_t * sdc[0].d/(sdc[0].em - 1)
//per atom precalcs
//sdpc.x = 1/sqrt(m)*sig[0].Yx
//sdpc.y = 1/sqrt(m)*sig[0].X
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd1V<>, //calculated in sd1
float4 posq<>, //positions pre-update
float4 posqp<>, //deltas post constraint
float3 vnew<>, //velocities after sd1
out float3 sd2X<>, //Used in sd1
out float3 v<>, //velocities corrected for constraints
out float4 posqp2<> //new deltas, need to be constrained again
){
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float3 fg1, fg2;
float3 Xmh;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg1 = fgauss[ igauss ];
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
fg2 = fgauss[ igauss ];
//v = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
v = pc1 * (posqp.xyz - posq.xyz);
Xmh = sd1V * pc2 + sdpc.x * fg1;
sd2X = sdpc.y * fg2;
posqp2 = posqp;
posqp2.xyz += sd2X - Xmh;
}
kernel void kupdate_sd1_fix1_FixedRV(
float xstrwidth, //atom stream width
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
//sd constants, prefixed with c
//to avoid 1 letter names, cxx = sdc[0].xx
float cem,
//sd precalculated constants
float pc1, //tau_t[0] * ( 1 - sdc[0].em )
float pc2, //tau_t[gt] * (sdc[gt].eph - sdc.emh)
float pc3, //sdc[0].d/(tau_t * sdc[0].c);
//sdpc = sd precalculated per atom values
//sdpc.x = 1/sqrt(m)*sig[0].Yv
//sdpc.y = 1/sqrt(m)*sig[0].V
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd2X<>, //This is calculated in sd2 of the previous step
float4 posq<>,
float3 f<>,
float3 v<>,
float invmass<>,
out float3 sd1V<>, //This is V in gromacs, reused in sd2
out float3 vnew<>,
out float4 posqp<>
){
float3 Vmh;
float3 fg1, fg2;
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float RandomValueVsp = 0.1f;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
//2 random vectors used in each fragment
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
//fg1 = fgauss[ igauss ];
fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
//fg2 = fgauss[ igauss ];
fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
/*
Gromacs code
n = 1 -> ngtc
sdc[n].gdt = dt/tau_t[n];
sdc[n].eph = exp(sdc[n].gdt/2);
sdc[n].emh = exp(-sdc[n].gdt/2);
sdc[n].em = exp(-sdc[n].gdt);
sdc[n].b = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);
// 2.15 in paper (C)
sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
Vmh = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);
V[n-start][d] = ism*sig[gt].V*fgauss(&jran);
v[n][d] = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
V[n-start][d] - sdc[gt].em*Vmh;
xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
*/
// ---------------------------------------------------------------------------------------------
// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
//sdpc.x = 1/sqrt(m)*sig[0].Yv
Vmh = sd2X * pc3 + sdpc.x * fg1;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//sdpc.y = 1/sqrt(m)*sig[0].V
sd1V = sdpc.y * fg2;
// ---------------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------------
//
// float pc1 = tau_t[0] * ( 1 - sdc[0].em )
vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
// float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
posqp = posq;
posqp.xyz += vnew * pc2;
}
kernel void kupdate_sd2_fix1_FixedRV(
float xstrwidth, //stream width of atoms
float gstrwidth, //stream width of random numbers
float goffset, //starting offset for random numbers
float pc1, //this is 1/pc2 for sd1
//= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
float pc2, //= tau_t * sdc[0].d/(sdc[0].em - 1)
//per atom precalcs
//sdpc.x = 1/sqrt(m)*sig[0].Yx
//sdpc.y = 1/sqrt(m)*sig[0].X
float2 sdpc<>,
//Normally distributed random vectors
float3 fgauss[][],
float3 sd1V<>, //calculated in sd1
float4 posq<>, //positions pre-update
float4 posqp<>, //deltas post constraint
float3 vnew<>, //velocities after sd1
out float3 sd2X<>, //Used in sd1
out float3 v<>, //velocities corrected for constraints
out float4 posqp2<> //new deltas, need to be constrained again
){
float linind_gauss; //linear index into the random numbers
float2 igauss; //2d index
float3 fg1, fg2;
float3 Xmh;
float RandomValueVsp = 0.1f;
igauss = indexof( posq );
linind_gauss = igauss.y * xstrwidth + igauss.x;
linind_gauss = 2 * linind_gauss + goffset;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
//fg1 = fgauss[ igauss ];
fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
linind_gauss += 1.0f;
//igauss.y = floor( linind_gauss / gstrwidth );
igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
igauss.x = linind_gauss - igauss.y * gstrwidth;
// fg2 = fgauss[ igauss ];
fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
//v = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
v = pc1 * (posqp.xyz - posq.xyz);
Xmh = sd1V * pc2 + sdpc.x * fg1;
sd2X = sdpc.y * fg2;
posqp2 = posqp;
posqp2.xyz += sd2X - Xmh;
}
platforms/brook/src/gpu/pre_test.br
0 → 100644
View file @
b53cd947
/* pre-tests, known to be right */
kernel void pre_test0( float input<>, out float output<> )
{
output = input;
}
kernel void pre_test1( float input<>, out float output<> )
{
output = ( input * input ) + input;
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment