Commit cb130f92 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Mods

parent cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* ---------------------------------------------------------------------------------------
   Pair term evaluated for one i-atom against four j-atoms at once (one j per
   float4 component).

   d1..d4:            i - j separation vectors for the four j-atoms
   jBornR, jQ:        Born radii and charges of the four j-atoms
   iBornR, iQ:        Born radius and (already pre-scaled) charge of the i-atom
   dGpol_dr:          output; scalar that the caller multiplies by d1..d4
   dGpol_dalpha2_ij:  output; per-j contribution the caller sums into the .w slot

   With r2 = |d|^2, a = jBornR*iBornR, D = r2/(4a), f2 = r2 + a*exp(-D) and
   G = iQ*jQ/sqrt(f2):
      dGpol_dr         = -G*( 1 - exp(-D)/4 )/f2
      dGpol_dalpha2_ij = -G*exp(-D)*( 1 + D )*jBornR/( 2*f2 )
   --------------------------------------------------------------------------------------- */
kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
                           float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
                           out float4 dGpol_dalpha2_ij<> ){

   // squared separations, one component per j-atom
   float4 distSq    = float4( dot( d1, d1 ), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );

   // product of the two Born radii and the scaled squared distance
   float4 radiiProd = jBornR*iBornR;
   float4 scaledSq  = distSq/( 4.0f*radiiProd );
   float4 decay     = exp( -scaledSq );

   // squared denominator of the pair term
   float4 fSq       = distSq + radiiProd*decay;

   // charge product over the denominator
   float4 pairTerm  = jQ/sqrt( fSq );
   pairTerm        *= iQ;

   dGpol_dr         = -pairTerm*( 1.0f - 0.25f*decay )/fSq;
   dGpol_dalpha2_ij = -0.5f*pairTerm*decay*( 1.0f + scaledSq )*jBornR/fSq;
}
/* ---------------------------------------------------------------------------------------
   Nonpolar ACE term for a single atom (Simbios)

   iBornRadius:        Born radius of the atom
   iVdwRadius:         Vdw radius of the atom
   duplicationFactor:  duplication factor; the result is divided by it
   aceForce:           output ACE term

   aceForce = PI_24_aI * (rVdw + probe)^2 * (rVdw/rBorn)^6 / (duplicationFactor*rBorn)
   --------------------------------------------------------------------------------------- */
kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
                               out float aceForce<> ){

   // solvent probe radius
   const float probeRadius = 0.14f;

   // magnitude is PI*4*6*0.0054*1000 (0.0054 = asolv from Tinker); stored negated
   //const float PI_24_aI = -0.3694512961;
   const float PI_24_aI    = -407.1504079f;

   float shell;
   float ratio;
   float ratioCubed;

   // (rVdw + probe); squared in the final expression
   shell      = iVdwRadius + probeRadius;

   // (rVdw/rBorn)**3; squared in the final expression to give the 6th power
   ratio      = iVdwRadius/iBornRadius;
   ratioCubed = ratio*ratio*ratio;

   aceForce   = ( shell*shell )*( ratioCubed*ratioCubed )*PI_24_aI/( duplicationFactor*iBornRadius );
}
/* ---------------------------------------------------------------------------------------
Calculate first loop force terms (Simbios)

Each output element handles four consecutive i-atoms (I_Unroll = 4) and loops over a
block of j-atoms, four j-atoms per inner iteration.

numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms -- accounts for unrolling
duplicationFactor: number of threads for inner loop
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
soluteDielectric: solute dielectric
solventDielectric: solvent dielectric
includeAce: include ACE term (treated as "on" when > 0.5)
posq: atom positions (only .xyz is read)
bornRadii: Born radii
atomicRadii: .x is the radius handed to kAceNonPolarLoop1, .y is read as the
partial charge
bornForce1: force on the 1st i-atom of the unroll in .xyz; .w starts at the ACE
term (or 0) and accumulates the summed dGpol_dalpha2_ij
bornForce2: same for the 2nd i-atom of the unroll
bornForce3: same for the 3rd i-atom of the unroll
bornForce4: same for the 4th i-atom of the unroll
--------------------------------------------------------------------------------------- */
kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float soluteDielectric,
float solventDielectric, float includeAce,
float3 posq[][], float bornRadii[][], float2 atomicRadii[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// Born radii
float i1BornR, i2BornR, i3BornR, i4BornR;
float j1BornR, j2BornR, j3BornR, j4BornR;
float4 jBornR;
// atomic radii
float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
// i,j coordinates
float3 i1Pos, i2Pos, i3Pos, i4Pos;
float3 j1Pos, j2Pos, j3Pos, j4Pos;
// NOTE(review): j1PosQ..j4PosQ are declared but never referenced in this kernel
float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;
// i, j partial charges
float i1Q, i2Q, i3Q, i4Q;
float j1Q, j2Q, j3Q, j4Q;
float4 jQ;
float aceForce;
// delta coordinates
float3 d1, d2, d3, d4;
// intermediate terms
float4 dGpol_dr, dGpol_dalpha2_ij;
// indices
float2 iAtom;
float forceIndex;
// This is forceIndex mod roundedUpAtoms, the true i index
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// electricConstant = -166.02691;
// preFactor = 2*electricConstant*( (1/soluteDielectric) - (1/solventDielectric) )
float preFactor = -332.05382f;
const float I_Unroll = 4.0f;
const float3 zero3 = float3( 0.0f, 0.0f, 0.0f );
// ---------------------------------------------------------------------------------------
preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
// linear index of the first of the four i-atoms handled by this output element
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather index
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );
// ---------------------------------------------------------------------------------------
// fetch i1 position and partial charge
// NOTE(review): posq is declared float3[][] but is gathered into the float4 jQ
// here (jQ is used as scratch; only .xyz is read) -- confirm this is intended
jQ = posq[ iAtom ];
i1Pos = jQ.xyz;
i1Q = atomicRadii[ iAtom ].y;
// the prefactor is folded into the i charge once, outside the j loop
i1Q *= preFactor;
i1BornR = bornRadii[ iAtom ];
i1AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
bornForce1.xyz = zero3;
bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// fetch i2 position and partial charge
// the next i-atom is taken from the next column of the same row (only x is advanced)
iAtom.x += 1;
jQ = posq[ iAtom ];
i2Pos = jQ.xyz;
i2Q = atomicRadii[ iAtom ].y;
i2Q *= preFactor;
i2BornR = bornRadii[ iAtom ];
i2AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
bornForce2.xyz = zero3;
bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// fetch i3 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i3Pos = jQ.xyz;
i3Q = atomicRadii[ iAtom ].y;
i3Q *= preFactor;
i3BornR = bornRadii[ iAtom ];
i3AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
bornForce3.xyz = zero3;
bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// fetch i4 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i4Pos = jQ.xyz;
i4Q = atomicRadii[ iAtom ].y;
i4Q *= preFactor;
i4BornR = bornRadii[ iAtom ];
i4AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
bornForce4.xyz = zero3;
bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// inner loop setup
// if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
// to increase number of threads in flight
// (NOTE(review): I_Unroll is 4 in this kernel; the example above looks stale)
// forceStreamSz = N*RepFac/I_UnRoll
// forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz )
// whichRep = 0 or 1
// jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ]
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
// the last replica runs to the end of the atom list; the 999999 bound is
// effectively "no row limit", the numberOfAtoms test below terminates the loop
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
// outer loop: rows of the atom stream assigned to this replica
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
// inner loop: four j-atoms per iteration along the row
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
// ---------------------------------------------------------------------------------------
// gather required values
// (all four j-atoms are gathered even when fewer than four remain below
// numberOfAtoms -- presumably the streams are padded; verify against caller)
j1Pos = posq[ jAtom ];
j1Q = atomicRadii[ jAtom ].y;
j1BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j2Pos = posq[ jAtom ];
j2Q = atomicRadii[ jAtom ].y;
j2BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j3Pos = posq[ jAtom ];
j3Q = atomicRadii[ jAtom ].y;
j3BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j4Pos = posq[ jAtom ];
j4Q = atomicRadii[ jAtom ].y;
j4BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR );
jQ = float4( j1Q, j2Q, j3Q, j4Q );
// ---------------------------------------------------------------------------------------
// i == 1
d1 = i1Pos - j1Pos;
d2 = i1Pos - j2Pos;
d3 = i1Pos - j3Pos;
d4 = i1Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce1.xyz += dGpol_dr.x*d1;
bornForce1.xyz += dGpol_dr.y*d2;
bornForce1.xyz += dGpol_dr.z*d3;
bornForce1.xyz += dGpol_dr.w*d4;
bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 2
d1 = i2Pos - j1Pos;
d2 = i2Pos - j2Pos;
d3 = i2Pos - j3Pos;
d4 = i2Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce2.xyz += dGpol_dr.x*d1;
bornForce2.xyz += dGpol_dr.y*d2;
bornForce2.xyz += dGpol_dr.z*d3;
bornForce2.xyz += dGpol_dr.w*d4;
bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 3
d1 = i3Pos - j1Pos;
d2 = i3Pos - j2Pos;
d3 = i3Pos - j3Pos;
d4 = i3Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce3.xyz += dGpol_dr.x*d1;
bornForce3.xyz += dGpol_dr.y*d2;
bornForce3.xyz += dGpol_dr.z*d3;
bornForce3.xyz += dGpol_dr.w*d4;
bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 4
d1 = i4Pos - j1Pos;
d2 = i4Pos - j2Pos;
d3 = i4Pos - j3Pos;
d4 = i4Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce4.xyz += dGpol_dr.x*d1;
bornForce4.xyz += dGpol_dr.y*d2;
bornForce4.xyz += dGpol_dr.z*d3;
bornForce4.xyz += dGpol_dr.w*d4;
bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// four j-atoms consumed this iteration
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
This diff is collapsed.
......@@ -99,3 +99,53 @@ kernel void kMergeFloat3_4(
}
}
/* Sums, for one atom, the partial forces held in four i-unrolled streams.
 *
 * For each of the repfac replicas the atom's linear force index is split into
 *   slot = index / iUnroll   (element of the partial streams to gather)
 *   lane = index % iUnroll   (which of pstream1..4 holds this atom)
 * All four streams are gathered and the wanted one is picked with selects
 * rather than branches. The index then advances by roundNatoms to the next
 * replica.
 *
 * repfac:          number of replicas to accumulate
 * atomStreamWidth: width of the output (atom) stream
 * pStreamWidth:    width of the partial-force streams
 * natoms:          not referenced in this kernel
 * roundNatoms:     index stride between replicas
 * iUnroll:         unroll factor used when the partial streams were written
 * pstream1..4:     partial-force streams
 * outstream:       accumulated force for this atom
 */
kernel void kMergeFloat3_4_nobranch(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float3 pstream1[][],
      float3 pstream2[][],
      float3 pstream3[][],
      float3 pstream4[][],
      out float3 outstream<> )
{
   float2 gatherAt;
   float  linear, slot, lane, rep;
   float3 part1, part2, part3, part4;
   float3 pick;

   // linear atom index of this output element; also the first force index
   gatherAt  = indexof( outstream );
   linear    = gatherAt.x + gatherAt.y*atomStreamWidth;

   outstream = float3( 0.0f, 0.0f, 0.0f );

   rep = 0.0f;
   while( rep < repfac ){

      // integer division / remainder by iUnroll, done with fmod + round
      slot = round( ( linear - fmod( linear, iUnroll ) )/iUnroll );
      lane = linear - iUnroll*slot;

      // 2D position of the slot in the partial streams
      gatherAt.y = round( ( slot - fmod( slot, pStreamWidth ) )/pStreamWidth );
      gatherAt.x = slot - gatherAt.y*pStreamWidth;

      part1 = pstream1[ gatherAt ];
      part2 = pstream2[ gatherAt ];
      part3 = pstream3[ gatherAt ];
      part4 = pstream4[ gatherAt ];

      // lane 0 -> part1, 1 -> part2, 2 -> part3, 3 -> part4
      pick = lane < 0.5f ? part1 : part2;
      pick = lane < 1.5f ? pick : part3;
      pick = lane < 2.5f ? pick : part4;

      outstream += pick;

      linear += roundNatoms;
      rep    += 1.0f;
   }
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Order N^2 neighbor searching.
 *
 * This only works for force fields that don't have charge groups.
 * If you insist on charge groups, you'll have to pass in appropriate masks here.
 *
 * This is a simplified kernel, for testing the O(N) speeds.
 *
 * This does a complete N^2 search without considering groups of
 * atoms. Most likely this will prove to be inefficient for
 * the O(N) kernel. Let's find out.
 *
 *
 * Each component of the curpass textures is an atom index (-1 means no
 * neighbor). Each pass records up to 16 neighbors per i atom. The w component
 * of curpass3 doubles as the resume point: it holds the linear j index of the
 * 16th neighbor found in this pass, or stays -1 if the scan reached the end,
 * and the next pass restarts the scan at prevpass3.w + 1.
 *
 * */
kernel void knborsearch(
float first, //Positive means constructing the first 16.
iter float2 wpos<>, //pixel position of output
float AtomStrHeight,
float AtomStrWidth,
float cutoff2, //square of the cutoff
float natoms, //number of atoms
float excl[][], //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
float4 posq[][], //atom positions/charges
float4 prevpass3<>, //Last output texture of previous pass
out float4 curpass0<>, //First output of current pass
out float4 curpass1<>,
out float4 curpass2<>,
out float4 curpass3<> //Last output of current pass, used in next pass
){
/*For this kernel, wpos == iatom*/
//NOTE(review): natoms is not referenced in the body; the scan runs over the
//full AtomStrWidth x AtomStrHeight stream
float2 iind;
float2 jind;
float3 ipos, jpos, dr;
float r2;
float listptr; //Where in the 16-chunk are we now.
float jlinind;
float breakflag; //positive means keep looping, negative means stop
float4 exclconst;
float2 exclind;
float exclusions;
//NOTE(review): exclconst is assigned here but never read afterwards
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
iind = wpos;
//excl is indexed as [ linear i index, linear j index ]
exclind.x = iind.x + iind.y * AtomStrWidth;
//fetch i atom
ipos = posq[ iind ].xyz;
//Loop over j depending on prevpass
//(the scan starts at j = 0 only if prevpass3.w is -1; the value of first is
//not consulted here -- presumably the caller supplies -1 on the first pass)
jlinind = prevpass3.w + 1;
jind.y = floor( jlinind / AtomStrWidth );
jind.x = fmod( jlinind, AtomStrWidth );
exclind.y = jlinind;
//listptr counts the neighbors stored so far in this pass
listptr = 0.0f;
breakflag = 1.0f;
//if we already finished, do nothing
if ( first < 0.0f && prevpass3.w < 0.0f )
breakflag = -1.0f;
//set to -1 to indicate no neighbor
//just to save a separate set of init calls
curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
curpass1 = curpass0;
curpass2 = curpass0;
curpass3 = curpass0;
while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
//First see if this pair is excluded
exclusions = excl[ exclind ];
if ( exclusions < 0.5f ) {
jpos = posq[ jind ].xyz;
dr = jpos - ipos;
r2 = dot( dr, dr );
//If it is inside the cutoff
if ( r2 < cutoff2 ) {
//Figure out where to put it
//We are allowed 4 nested conditionals
//We can play with the structuring of these
if ( listptr < 0.5f )
curpass0.x = jlinind;
else if ( listptr < 1.5f )
curpass0.y = jlinind;
else if ( listptr < 2.5f )
curpass0.z = jlinind;
else if ( listptr < 3.5f )
curpass0.w = jlinind;
else if ( listptr < 4.5f )
curpass1.x = jlinind;
else if ( listptr < 5.5f )
curpass1.y = jlinind;
else if ( listptr < 6.5f )
curpass1.z = jlinind;
else if ( listptr < 7.5f )
curpass1.w = jlinind;
else if ( listptr < 8.5f )
curpass2.x = jlinind;
else if ( listptr < 9.5f )
curpass2.y = jlinind;
else if ( listptr < 10.5f )
curpass2.z = jlinind;
else if ( listptr < 11.5f )
curpass2.w = jlinind;
else if ( listptr < 12.5f )
curpass3.x = jlinind;
else if ( listptr < 13.5f )
curpass3.y = jlinind;
else if ( listptr < 14.5f ) {
curpass3.z = jlinind;
}
else if ( listptr < 15.5f ) {
//We're done for this pass
//the 16th slot is both a neighbor and the resume marker for the next pass
curpass3.w = jlinind;
breakflag = -1.0f;
}
listptr += 1.0f;
}
}
//advance to the next j atom along the row
jlinind += 1.0f;
exclind.y += 1.0f;
jind.x += 1.0f;
}
//wrap to the start of the next row
jind.x = 0.0f;
jind.y += 1.0f;
}
}
//Precomputes the Lennard-Jones sigma and epsilon of each (i, neighbor) pair
//to save an indirect fetch (and a few flops) in the
//force kernel. The charge product is not done this way
//because charges have to be fetched anyway with the
//positions.
//
//For every entry of the two neighbor lists the output holds
//  sig = sigma_i + sigma_j      eps = epsilon_i * epsilon_j
//with the i atom taken at wpos and j converted from its linear index.
kernel void knl_precompute_sigeps(
      float AtomStrWidth,
      iter float2 wpos<>,
      float2 sigeps[][], //x=sigma, y=epsilon
      float4 nlist0<>,
      float4 nlist1<>,
      out float4 sig0<>,
      out float4 eps0<>,
      out float4 sig1<>,
      out float4 eps1<>
      )
{
   float2 own;                   //sigma/epsilon of the i atom
   float4 row, col;              //2D position of each listed neighbor
   float2 at;                    //gather index
   float2 nbA, nbB, nbC, nbD;    //sigma/epsilon of the four neighbors

   own = sigeps[ wpos ];

   //1st nlist set: linear index -> (col, row)
   row = floor( nlist0 / AtomStrWidth );
   col = nlist0 - row * AtomStrWidth;

   at.x = col.x;
   at.y = row.x;
   nbA  = sigeps[ at ];
   at.x = col.y;
   at.y = row.y;
   nbB  = sigeps[ at ];
   at.x = col.z;
   at.y = row.z;
   nbC  = sigeps[ at ];
   at.x = col.w;
   at.y = row.w;
   nbD  = sigeps[ at ];

   sig0 = float4( own.x + nbA.x, own.x + nbB.x, own.x + nbC.x, own.x + nbD.x );
   eps0 = float4( own.y * nbA.y, own.y * nbB.y, own.y * nbC.y, own.y * nbD.y );

   //2nd nlist set
   row = floor( nlist1 / AtomStrWidth );
   col = nlist1 - row * AtomStrWidth;

   at.x = col.x;
   at.y = row.x;
   nbA  = sigeps[ at ];
   at.x = col.y;
   at.y = row.y;
   nbB  = sigeps[ at ];
   at.x = col.z;
   at.y = row.z;
   nbC  = sigeps[ at ];
   at.x = col.w;
   at.y = row.w;
   nbD  = sigeps[ at ];

   sig1 = float4( own.x + nbA.x, own.x + nbB.x, own.x + nbC.x, own.x + nbD.x );
   eps1 = float4( own.y * nbA.y, own.y * nbB.y, own.y * nbC.y, own.y * nbD.y );
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Proper (periodic) dihedrals.
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, each dihedral carries one float4 of
//parameters: parms = ( cp, phi0, mult, 0.0 ).
//
//With phi the signed dihedral angle (from acos, so in radians):
//  mdphi = mult*phi - phi0
//  ddphi = -cp*mult*sin( mdphi )
//
//There is no guard against collinear atoms: msq or nsq of zero leads to a
//division by zero.
kernel void kpdih(
      float xstrwidth, //stream width for x
      float4 xq[][], //particle coordinates and charges
      float4 atoms<>, //ijkl quartets
      float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
      out float3 fi<>, //output forces for i, j, k, l
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
      ) {
   float3 r_ij, r_kj, r_kl;
   float2 ai, aj, ak, al;
   float3 m, n;
   float sgnphi;
   float phi, ddphi, mdphi;
   float3 u, v, s;
   float nrkj, nrkj2, msq, nsq, cos_phi;

   //Convert from linear indices to 2D indices into x
   //If this kernel is compute bound, we can do this
   //conversion before-hand and feed in the 2D coordinates
   ai.y = floor( atoms.x / xstrwidth );
   ai.x = atoms.x - ai.y * xstrwidth;
   aj.y = floor( atoms.y / xstrwidth );
   aj.x = atoms.y - aj.y * xstrwidth;
   ak.y = floor( atoms.z / xstrwidth );
   ak.x = atoms.z - ak.y * xstrwidth;
   al.y = floor( atoms.w / xstrwidth );
   al.x = atoms.w - al.y * xstrwidth;

   r_ij = xq[ai].xyz - xq[aj].xyz; //3
   r_kj = xq[ak].xyz - xq[aj].xyz; //3
   r_kl = xq[ak].xyz - xq[al].xyz; //3

   //normals of the ijk and jkl planes
   m = cross( r_ij, r_kj ); //9
   n = cross( r_kj, r_kl ); //9
   msq = dot(m, m); //5
   nsq = dot(n, n); //5

   //clamp guards acos against rounding slightly outside [-1, 1]
   cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0f, 1.0f ); //8

   //Strictly +1 or -1. sign() would return 0 when dot( r_ij, n ) is exactly 0,
   //which collapses a planar trans dihedral (phi = pi) to phi = 0.
   sgnphi = ( dot( r_ij, n ) < 0.0f ) ? -1.0f : 1.0f; //5
   phi = sgnphi * acos( cos_phi ); //2

   mdphi = parms.z * phi - parms.y; //2
   ddphi = - parms.x * parms.z * sin( mdphi ); //3

   nrkj2 = dot( r_kj, r_kj ); //5
   nrkj = sqrt( nrkj2 ); //1

   //forces on the end atoms lie along the plane normals
   fi = -ddphi * nrkj / msq * m; //5
   fl = ddphi * nrkj / nsq * n; //5

   //distribute onto the central atoms
   u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
   v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
   s = u - v; //3
   fj = s - fi; //3
   fk = -(s + fl); //3
   //Total : 100 flops
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert-Bellemans dihedrals, needed for Amber/OPLS ff's
//
//Takes a stream of atom quartets i, j, k, l and writes one float3
//force stream per atom: fi, fj, fk, fl.
//Kept deliberately simple; optimize only if this kernel turns out
//to be the bottleneck.
//Each dihedral carries 6 parameters, streamed as a float4 (parm03)
//and a float2 (parm45). parm03.x is not read here.
kernel void krbdih(
      float xstrwidth, //stream width for x
      float4 xq[][], //particle coordinates and charges
      float4 atoms<>, //ijkl quartets
      float4 parm03<>, //params 0-3
      float2 parm45<>, //params 4 and 5
      out float3 fi<>, //output forces for i, j, k, l
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
      ) {
   float2 idxI, idxJ, idxK, idxL;
   float3 posJ, posK;
   float3 vecIJ, vecKJ, vecKL;
   float3 normM, normN;
   float3 projI, projL, shift;
   float lenM2, lenN2, kjLen, kjLen2;
   float cosPhi, sinPhi, slope;

   //Linear atom indices -> 2D gather indices into xq.
   //Could be precomputed on the host if this kernel is compute bound.
   idxI.y = floor( atoms.x / xstrwidth );
   idxI.x = atoms.x - idxI.y * xstrwidth;
   idxJ.y = floor( atoms.y / xstrwidth );
   idxJ.x = atoms.y - idxJ.y * xstrwidth;
   idxK.y = floor( atoms.z / xstrwidth );
   idxK.x = atoms.z - idxK.y * xstrwidth;
   idxL.y = floor( atoms.w / xstrwidth );
   idxL.x = atoms.w - idxL.y * xstrwidth;

   posJ = xq[idxJ].xyz;
   posK = xq[idxK].xyz;

   vecIJ = xq[idxI].xyz - posJ; //3
   vecKJ = posK - posJ; //3
   vecKL = posK - xq[idxL].xyz; //3

   //normals of the ijk and jkl planes
   normM = cross( vecIJ, vecKJ ); //9
   normN = cross( vecKJ, vecKL ); //9
   lenM2 = dot( normM, normM ); //5
   lenN2 = dot( normN, normN ); //5

   //"Polymer convention" (see gromacs code): cosine and sine are both negated
   cosPhi = -( dot( normM, normN )/sqrt( lenM2*lenN2 ) ); //8 (sqrt=1)
   sinPhi = -sign( dot( vecIJ, normN ) )*sqrt( clamp( 1.0 - cosPhi * cosPhi, 0.0, 1.0) ); //8

   //slope is sum_{i=1}^5 i parm_i cosphi^{i-1}, evaluated by Horner's rule,
   //then multiplied by -sin(phi).
   //Precomputing ratios of the parameters might map better onto
   //4-way mads; for now the compiler is left to figure it out.
   slope = parm03.y + ( 2.0 * parm03.z
                      + ( 3.0 * parm03.w
                        + ( 4.0 * parm45.x
                          + ( 5.0 * parm45.y ) * cosPhi ) * cosPhi ) * cosPhi ) * cosPhi;
   slope = -slope * sinPhi; //13 flops total

   kjLen2 = dot( vecKJ, vecKJ ); //5
   kjLen = sqrt( kjLen2 ); //1

   //forces on the end atoms lie along the plane normals
   fi = -slope * kjLen / lenM2 * normM; //5
   fl = slope * kjLen / lenN2 * normN; //5

   //distribute onto the central atoms
   projI = dot( vecIJ, vecKJ ) / kjLen2 * fi; //9
   projL = dot( vecKL, vecKJ ) / kjLen2 * fl; //9
   shift = projI - projL; //3
   fj = shift - fi; //3
   fk = -(shift + fl); //3
   //Total flops: 109 per rb torsion.
}
// Host-callable entry points of the kshakeh* family. Only the declarations are
// visible here; each takes scalar constants by value followed by ::brook::stream
// handles. NOTE(review): presumably these are the generated wrappers for Brook
// kernels of the same names -- confirm against the kernel sources.

// kshakeh_fix1: same parameter list as kshakeh and kshakeh_fix2.
void kshakeh_fix1 (const float nit,
const float strwidth,
const float invmH,
const float omega,
::brook::stream atoms,
::brook::stream posq,
::brook::stream posqp,
::brook::stream params,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3);
// kshakeh_fix2: same parameter list as kshakeh and kshakeh_fix1.
void kshakeh_fix2 (const float nit,
const float strwidth,
const float invmH,
const float omega,
::brook::stream atoms,
::brook::stream posq,
::brook::stream posqp,
::brook::stream params,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3);
// kshakeh_update: takes an inverse-map stream, posq, the four cposq streams
// and a final oposq stream.
void kshakeh_update (const float strwidth,
::brook::stream invmap,
::brook::stream posq,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3,
::brook::stream oposq) ;
// kshakeh: base variant; same parameter list as the _fix1/_fix2 variants.
void kshakeh (const float nit,
const float strwidth,
const float invmH,
const float omega,
::brook::stream atoms,
::brook::stream posq,
::brook::stream posqp,
::brook::stream params,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3);
// kshakeh_update1_fix1: like kshakeh_update but with an extra sdpc1 scalar and
// the posqp and vPrime streams.
void kshakeh_update1_fix1 (
const float strwidth,
const float sdpc1,
::brook::stream invmap,
::brook::stream posq,
::brook::stream posqp,
::brook::stream vPrime,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3,
::brook::stream oposq);
// kshakeh_update1_fix1Old: same parameter list as kshakeh_update.
void kshakeh_update1_fix1Old (const float strwidth,
::brook::stream invmap,
::brook::stream posq,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3,
::brook::stream oposq);
// kshakeh_update2_fix1: like kshakeh_update with an additional posqp stream.
void kshakeh_update2_fix1 (const float strwidth,
::brook::stream invmap,
::brook::stream posq,
::brook::stream posqp,
::brook::stream cposq0,
::brook::stream cposq1,
::brook::stream cposq2,
::brook::stream cposq3,
::brook::stream oposq);
// Host-callable entry points of the kupdate_sd* family and kpermute_vectors.
// Only the declarations are visible here; each takes scalar constants by value
// followed by ::brook::stream handles. NOTE(review): presumably these are the
// generated wrappers for Brook kernels of the same names -- confirm against the
// kernel sources.

// kupdate_sd1: the _fix1 and _fix1_FixedRV variants share this parameter list.
void kupdate_sd1 (
const float xstrwidth,
const float gstrwidth,
const float goffset,
const float cem,
const float pc1,
const float pc2,
const float pc3,
::brook::stream sdpc,
::brook::stream fgauss,
::brook::stream sd2X,
::brook::stream posq,
::brook::stream f,
::brook::stream v,
::brook::stream invmass,
::brook::stream sd1V,
::brook::stream vnew,
::brook::stream posqp);
// kupdate_sd2: the _fix1 and _fix1_FixedRV variants share this parameter list.
void kupdate_sd2 (
const float xstrwidth,
const float gstrwidth,
const float goffset,
const float pc1,
const float pc2,
::brook::stream sdpc,
::brook::stream fgauss,
::brook::stream sd1V,
::brook::stream posq,
::brook::stream posqp,
::brook::stream vnew,
::brook::stream sd2X,
::brook::stream v,
::brook::stream posqp2);
// kpermute_vectors: takes a stream width, a perm stream and gvin/gvout streams.
void kpermute_vectors (const float gstrwidth,
::brook::stream perm,
::brook::stream gvin,
::brook::stream gvout);
// kupdate_sd2_fix1: same parameter list as kupdate_sd2.
void kupdate_sd2_fix1 (const float xstrwidth,
const float gstrwidth,
const float goffset,
const float pc1,
const float pc2,
::brook::stream sdpc,
::brook::stream fgauss,
::brook::stream sd1V,
::brook::stream posq,
::brook::stream posqp,
::brook::stream vnew,
::brook::stream sd2X,
::brook::stream v,
::brook::stream posqp2);
// kupdate_sd1_fix1: same parameter list as kupdate_sd1.
void kupdate_sd1_fix1 (const float xstrwidth,
const float gstrwidth,
const float goffset,
const float cem,
const float pc1,
const float pc2,
const float pc3,
::brook::stream sdpc,
::brook::stream fgauss,
::brook::stream sd2X,
::brook::stream posq,
::brook::stream f,
::brook::stream v,
::brook::stream invmass,
::brook::stream sd1V,
::brook::stream vnew,
::brook::stream posqp);
// kupdate_sd2_fix1_FixedRV: same parameter list as kupdate_sd2.
void kupdate_sd2_fix1_FixedRV(const float xstrwidth,
const float gstrwidth,
const float goffset,
const float pc1,
const float pc2,
::brook::stream sdpc,
::brook::stream fgauss,
::brook::stream sd1V,
::brook::stream posq,
::brook::stream posqp,
::brook::stream vnew,
::brook::stream sd2X,
::brook::stream v,
::brook::stream posqp2);
// kupdate_sd1_fix1_FixedRV: same parameter list as kupdate_sd1.
void kupdate_sd1_fix1_FixedRV(const float xstrwidth,
const float gstrwidth,
const float goffset,
const float cem,
const float pc1,
const float pc2,
const float pc3,
::brook::stream sdpc,
::brook::stream fgauss,
::brook::stream sd2X,
::brook::stream posq,
::brook::stream f,
::brook::stream v,
::brook::stream invmass,
::brook::stream sd1V,
::brook::stream vnew,
::brook::stream posqp);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment