Commit 8c3878b2 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Mods

parent 116a99ba
...@@ -486,7 +486,6 @@ void kObcLoop2( ...@@ -486,7 +486,6 @@ void kObcLoop2(
float streamWidth, float streamWidth,
float fstreamWidth, float fstreamWidth,
::brook::stream posq, ::brook::stream posq,
::brook::stream atomicRadii,
::brook::stream scaledAtomicRadii, ::brook::stream scaledAtomicRadii,
::brook::stream bornForceFactor, ::brook::stream bornForceFactor,
::brook::stream bornForce1, ::brook::stream bornForce1,
......
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//96 : 228 Flops
kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
out float4 dGpol_dalpha2_ij<> ){
// ---------------------------------------------------------------------------------------
float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol;
// ---------------------------------------------------------------------------------------
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20
alpha2_ij = jBornR*iBornR; //4
D_ij = r2/(4.0f*alpha2_ij); //8
expTerm = exp( -D_ij ); //4 : 80
denominator2 = r2 + alpha2_ij*expTerm; //8
denominator = sqrt( denominator2 ); //4 : 60
Gpol = jQ/denominator; //4
Gpol *= iQ;//4
dGpol_dr = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2; //16
dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; //24
}
/* ---------------------------------------------------------------------------------------
Calculate nonpolar ACE term (Simbios)
bornRadius: Born radius
vdwRadius: Vdw radius
duplicationFactor: duplication factor
aceForce: ACE term
--------------------------------------------------------------------------------------- */
kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
out float aceForce<> ){
// ---------------------------------------------------------------------------------------
// nonpolar term
float iSurface;
float iAceTerm;
// ---------------------------------------------------------------------------------------
// constants
// solvent radius
const float probeRadius = 0.14f;
// PI*24*0.0f049 (0.0f054=asolv from Tinker)
//const float PI_24_aI = -0.3694512961;
const float PI_24_aI = -400.71504079f;
// ---------------------------------------------------------------------------------------
// etch i position and partial charge
// e = ai * term * (ri+probe)**2 * (ri/rb)**6
// (drbi) = drb(i) - 6.0fd0*e/rb
// (rI+probe)**2
iSurface = (iVdwRadius+probeRadius);
iSurface = iSurface*iSurface;
// (rI/rB)**6
iAceTerm = iVdwRadius/iBornRadius;
iAceTerm = iAceTerm*iAceTerm*iAceTerm;
iAceTerm = iAceTerm*iAceTerm;
aceForce = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius);
}
/* ---------------------------------------------------------------------------------------
Calculate first loop force terms (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms -- accounts for unrolling
duplicationFactor: number of threads for inner loop
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
soluteDielectric: solute dielectric
solventDielectric: solvent dielectric
includeAce: include ACE term
posq: atom positions and charge
bornRadii: Born radii
nonpolarForce: nonpolar force (0 if nonpolar not included, else
ACE value)
bornForce1: i-unroll first force component, including dBornR/dr in .w
bornForce2: i-unroll second force component, including dBornR/dr in .w
bornForce3: i-unroll first force component, including dBornR/dr in .w
bornForce4: i-unroll second force component, including dBornR/dr in .w
--------------------------------------------------------------------------------------- */
//FLOPS = 544
kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float soluteDielectric,
float solventDielectric, float includeAce,
float4 posq[][], float bornRadii[][], float atomicRadii[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// Born radii
float i1BornR, i2BornR, i3BornR, i4BornR;
float j1BornR, j2BornR, j3BornR, j4BornR;
float4 jBornR;
// atomic radii
float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
// i,j coordinates
float3 i1Pos, i2Pos, i3Pos, i4Pos;
float3 j1Pos, j2Pos, j3Pos, j4Pos;
float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;
// i, j partial charges
float i1Q, i2Q, i3Q, i4Q;
float j1Q, j2Q, j3Q, j4Q;
float4 jQ;
float aceForce;
// delta coordinates
float3 d1, d2, d3, d4;
// intermediate terms
float4 dGpol_dr, dGpol_dalpha2_ij;
// indices
float2 iAtom;
float forceIndex;
// This is forceIndex mod numberOfAtoms, the true i index
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// electricConstant = -166.0f2691;
// preFactor = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric))
float preFactor = -332.05382f;
const float I_Unroll = 4.0f;
const float3 zero3 = float3( 0.0f, 0.0f, 0.0f );
// ---------------------------------------------------------------------------------------
preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather index
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );
// ---------------------------------------------------------------------------------------
// etch i1 position and partial charge
jQ = posq[ iAtom ];
i1BornR = bornRadii[ iAtom ];
i1AtomicR = atomicRadii[ iAtom ];
i1Pos = jQ.xyz;
i1Q = jQ.w;
i1Q *= preFactor;
kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
bornForce1.xyz = zero3;
bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// etch i2 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i2BornR = bornRadii[ iAtom ];
i2AtomicR = atomicRadii[ iAtom ];
i2Pos = jQ.xyz;
i2Q = jQ.w;
i2Q *= preFactor;
kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
bornForce2.xyz = zero3;
bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// etch i3 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i3BornR = bornRadii[ iAtom ];
i3AtomicR = atomicRadii[ iAtom ];
i3Pos = jQ.xyz;
i3Q = jQ.w;
i3Q *= preFactor;
kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
bornForce3.xyz = zero3;
bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// etch i4 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i4BornR = bornRadii[ iAtom ];
i4AtomicR = atomicRadii[ iAtom ];
i4Pos = jQ.xyz;
i4Q = jQ.w;
i4Q *= preFactor;
kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
bornForce4.xyz = zero3;
bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// inner loop setup
// if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
// to increase number of threads in flight
// forceStreamSz = N*RepFac/I_UnRoll
// forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz )
// whichRep = 0 or 1
// jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ]
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
//jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
tmp = fmod( numberOfAtoms, (duplicationFactor*streamWidth ) );
jBlock = 1 + round( (numberOfAtoms-tmp)/(duplicationFactor*streamWidth ));
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
// ---------------------------------------------------------------------------------------
// gather required values
j1PosQ = posq[ jAtom ];
j1Pos = j1PosQ.xyz;
j1Q = j1PosQ.w;
j1BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j2PosQ = posq[ jAtom ];
j2Pos = j2PosQ.xyz;
j2Q = j2PosQ.w;
j2BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j3PosQ = posq[ jAtom ];
j3Pos = j3PosQ.xyz;
j3Q = j3PosQ.w;
j3BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j4PosQ = posq[ jAtom ];
j4Pos = j4PosQ.xyz;
j4Q = j4PosQ.w;
j4BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR );
jQ = float4( j1Q, j2Q, j3Q, j4Q );
// ---------------------------------------------------------------------------------------
// i == 1
d1 = i1Pos - j1Pos; //3
d2 = i1Pos - j2Pos; //3
d3 = i1Pos - j3Pos; //3
d4 = i1Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce1.xyz += dGpol_dr.x*d1; //6
bornForce1.xyz += dGpol_dr.y*d2; //6
bornForce1.xyz += dGpol_dr.z*d3; //6
bornForce1.xyz += dGpol_dr.w*d4; //6
bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
// i == 2
d1 = i2Pos - j1Pos; //3
d2 = i2Pos - j2Pos; //3
d3 = i2Pos - j3Pos; //3
d4 = i2Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce2.xyz += dGpol_dr.x*d1; //6
bornForce2.xyz += dGpol_dr.y*d2; //6
bornForce2.xyz += dGpol_dr.z*d3; //6
bornForce2.xyz += dGpol_dr.w*d4; //6
bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
// i == 3
d1 = i3Pos - j1Pos; //3
d2 = i3Pos - j2Pos; //3
d3 = i3Pos - j3Pos; //3
d4 = i3Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce3.xyz += dGpol_dr.x*d1; //6
bornForce3.xyz += dGpol_dr.y*d2; //6
bornForce3.xyz += dGpol_dr.z*d3; //6
bornForce3.xyz += dGpol_dr.w*d4; //6
bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
// i == 4
d1 = i4Pos - j1Pos; //3
d2 = i4Pos - j2Pos; //3
d3 = i4Pos - j3Pos; //3
d4 = i4Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce4.xyz += dGpol_dr.x*d1; //6
bornForce4.xyz += dGpol_dr.y*d2; //6
bornForce4.xyz += dGpol_dr.z*d3; //6
bornForce4.xyz += dGpol_dr.w*d4; //6
bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
...@@ -225,154 +225,6 @@ kernel void kMergeFloat4_4( ...@@ -225,154 +225,6 @@ kernel void kMergeFloat4_4(
} }
} }
kernel void kPostObcLoop1(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float obcChain<>,
float bornRadii<>,
out float4 outstream<>,
out float bornRadii2Force<> )
{
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float4 o1,o2,o3,o4;
float4 tmp;
float4 zero4;
// given atom index find force indices and streams
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
outstream = zero4;
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
// outstream += float4( forceIndex, qIndex, pindex.x, pindex.y );
// This is going to cause really divergent code and we are
// going to end up doing all the fetches anyway...
if ( qOff < 0.5f ){
outstream += pstream1[ pindex ];
} else if( qOff < 1.5f ){
outstream += pstream2[ pindex ];
} else if( qOff < 2.5f ){
outstream += pstream3[ pindex ];
} else {
outstream += pstream4[ pindex ];
}
// o1 = pstream1[ pindex ];
// o2 = pstream2[ pindex ];
// o3 = pstream3[ pindex ];
// o4 = pstream4[ pindex ];
// tmp = qOff < 0.5f ? o1 : o2;
// tmp = qOff < 1.5f ? tmp : o3;
// tmp = qOff < 2.5f ? tmp : o4;
// outstream += tmp;
forceIndex += roundNatoms;
}
bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w;
}
// The inner loop by definition creates divergent paths. Chances are
// fair that we will take all sides of the branch anyway, so this
// verion uses manual predication
kernel void kPostObcLoop1_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float obcChain<>,
float bornRadii<>,
out float4 outstream<>,
out float bornRadii2Force<> )
{
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float4 o1,o2,o3,o4;
float4 tmp;
// given atom index find force indices and streams
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
o1 = pstream1[ pindex ];
o2 = pstream2[ pindex ];
o3 = pstream3[ pindex ];
o4 = pstream4[ pindex ];
tmp = qOff < 0.5f ? o1 : o2;
tmp = qOff < 1.5f ? tmp : o3;
tmp = qOff < 2.5f ? tmp : o4;
outstream += tmp;
forceIndex += roundNatoms;
}
bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w;
}
kernel void kSetValue4( float value, out float4 outstream<> ){ kernel void kSetValue4( float value, out float4 outstream<> ){
outstream = float4( value, value, value, value ); outstream = float4( value, value, value, value );
} }
...@@ -522,354 +374,6 @@ kernel void kAddAndMergeFloat4_4( ...@@ -522,354 +374,6 @@ kernel void kAddAndMergeFloat4_4(
} }
} }
kernel void kPostObcLoop2(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float conversion,
float mergeNonObcForces,
float4 inObcForces<>,
float3 nonObcForces<>,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float atomicRadii<>,
out float bornRadii<>,
out float obcChain<>,
out float3 outForces<> ){
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float2 iAtom;
float4 forces;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
// given atom index find force indices and streams
pindex = indexof( outForces );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// add current forces in inStream to forces stored in pstreams
// the .w entry is Born sum values; it will be used to calculate the
// Born radii and obcChain term
forces = inObcForces;
forces.w = 0.0f;
//forces = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// sum over j-loop 'duplications' by gathering from pstreams
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
if( qOff < 0.5f ){
forces += pstream1[ pindex ];
} else if( qOff < 1.5f ){
forces += pstream2[ pindex ];
} else if( qOff < 2.5f ){
forces += pstream3[ pindex ];
} else {
forces += pstream4[ pindex ];
}
forceIndex += roundNatoms;
}
// compute Born radii and ObcChain
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum = forces.w;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
if( atomIndex >= natoms ){
bornRadii = 0.0f;
obcChain = 0.0f;
}
// add converted new forces to non-Obc forces
outForces = conversion*forces.xyz;
if( mergeNonObcForces > 0.1f ){
outForces += nonObcForces;
}
}
kernel void kPostObcLoop2_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float conversion,
float mergeNonObcForces,
float4 inObcForces<>,
float3 nonObcForces<>,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float atomicRadii<>,
out float bornRadii<>,
out float obcChain<>,
out float3 outForces<> ){
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float4 o1,o2,o3,o4;
float4 tmp;
float2 iAtom;
float4 forces;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
// given atom index find force indices and streams
pindex = indexof( outForces );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// add current forces in inStream to forces stored in pstreams
// the .w entry is Born sum values; it will be used to calculate the
// Born radii and obcChain term
forces = inObcForces;
forces.w = 0.0f;
//forces = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// sum over j-loop 'duplications' by gathering from pstreams
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
o1 = pstream1[ pindex ];
o2 = pstream2[ pindex ];
o3 = pstream3[ pindex ];
o4 = pstream4[ pindex ];
tmp = qOff < 0.5f ? o1 : o2;
tmp = qOff < 1.5f ? tmp : o3;
tmp = qOff < 2.5f ? tmp : o4;
forces += tmp;
forceIndex += roundNatoms;
}
// compute Born radii and ObcChain
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum = forces.w;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
if( atomIndex >= natoms ){
bornRadii = 0.0f;
obcChain = 0.0f;
}
// add converted new forces to non-Obc forces
outForces = conversion*forces.xyz;
if( mergeNonObcForces > 0.1f ){
outForces += nonObcForces;
}
}
kernel void kPostCalculateBornRadii_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float conversion,
float mergeNonObcForces,
float4 pstream1[][],
float atomicRadii<>,
out float bornRadii<>,
out float obcChain<> ){
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float4 o1;
float tmp;
float2 iAtom;
float4 forces;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
// given atom index find force indices and streams
pindex = indexof( obcChain );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// add current forces in inStream to forces stored in pstreams
// the .w entry is Born sum values; it will be used to calculate the
// Born radii and obcChain term
bornSum = 0.0f;
// sum over j-loop 'duplications' by gathering from pstreams
for( i = 0.0f; i < repfac; i += 1.0f ){
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
pindex.x = qIndex - pindex.y*pStreamWidth;
o1 = pstream1[ pindex ];
tmp = qOff < 0.5f ? o1.x : o1.y;
tmp = qOff < 1.5f ? tmp : o1.z;
tmp = qOff < 2.5f ? tmp : o1.w;
bornSum += tmp;
forceIndex += roundNatoms;
}
// compute Born radii and ObcChain
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
if( atomIndex >= natoms ){
bornRadii = 0.0f;
obcChain = 0.0f;
}
}
/* Add forces from two streams */ /* Add forces from two streams */
kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){ kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment