Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8c3878b2
Commit
8c3878b2
authored
Jan 02, 2009
by
Mark Friedrichs
Browse files
Mods
parent
116a99ba
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
974 additions
and
1250 deletions
+974
-1250
platforms/brook/src/gpu/kgbsa.br
platforms/brook/src/gpu/kgbsa.br
+974
-358
platforms/brook/src/gpu/kgbsa.h
platforms/brook/src/gpu/kgbsa.h
+0
-1
platforms/brook/src/gpu/kgbsa1.br
platforms/brook/src/gpu/kgbsa1.br
+0
-395
platforms/brook/src/gpu/kmerge.br
platforms/brook/src/gpu/kmerge.br
+0
-496
No files found.
platforms/brook/src/gpu/k
ObcBaseD2
.br
→
platforms/brook/src/gpu/k
gbsa
.br
View file @
8c3878b2
This diff is collapsed.
Click to expand it.
platforms/brook/src/gpu/kgbsa.h
View file @
8c3878b2
...
@@ -486,7 +486,6 @@ void kObcLoop2(
...
@@ -486,7 +486,6 @@ void kObcLoop2(
float
streamWidth
,
float
streamWidth
,
float
fstreamWidth
,
float
fstreamWidth
,
::
brook
::
stream
posq
,
::
brook
::
stream
posq
,
::
brook
::
stream
atomicRadii
,
::
brook
::
stream
scaledAtomicRadii
,
::
brook
::
stream
scaledAtomicRadii
,
::
brook
::
stream
bornForceFactor
,
::
brook
::
stream
bornForceFactor
,
::
brook
::
stream
bornForce1
,
::
brook
::
stream
bornForce1
,
...
...
platforms/brook/src/gpu/kgbsa1.br
deleted
100644 → 0
View file @
116a99ba
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//96 : 228 Flops
kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
out float4 dGpol_dalpha2_ij<> ){
// ---------------------------------------------------------------------------------------
float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol;
// ---------------------------------------------------------------------------------------
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20
alpha2_ij = jBornR*iBornR; //4
D_ij = r2/(4.0f*alpha2_ij); //8
expTerm = exp( -D_ij ); //4 : 80
denominator2 = r2 + alpha2_ij*expTerm; //8
denominator = sqrt( denominator2 ); //4 : 60
Gpol = jQ/denominator; //4
Gpol *= iQ;//4
dGpol_dr = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2; //16
dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; //24
}
/* ---------------------------------------------------------------------------------------
Calculate nonpolar ACE term (Simbios)
bornRadius: Born radius
vdwRadius: Vdw radius
duplicationFactor: duplication factor
aceForce: ACE term
--------------------------------------------------------------------------------------- */
kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
out float aceForce<> ){
// ---------------------------------------------------------------------------------------
// nonpolar term
float iSurface;
float iAceTerm;
// ---------------------------------------------------------------------------------------
// constants
// solvent radius
const float probeRadius = 0.14f;
// PI*24*0.0f049 (0.0f054=asolv from Tinker)
//const float PI_24_aI = -0.3694512961;
const float PI_24_aI = -400.71504079f;
// ---------------------------------------------------------------------------------------
// etch i position and partial charge
// e = ai * term * (ri+probe)**2 * (ri/rb)**6
// (drbi) = drb(i) - 6.0fd0*e/rb
// (rI+probe)**2
iSurface = (iVdwRadius+probeRadius);
iSurface = iSurface*iSurface;
// (rI/rB)**6
iAceTerm = iVdwRadius/iBornRadius;
iAceTerm = iAceTerm*iAceTerm*iAceTerm;
iAceTerm = iAceTerm*iAceTerm;
aceForce = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius);
}
/* ---------------------------------------------------------------------------------------
Calculate first loop force terms (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms -- accounts for unrolling
duplicationFactor: number of threads for inner loop
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
soluteDielectric: solute dielectric
solventDielectric: solvent dielectric
includeAce: include ACE term
posq: atom positions and charge
bornRadii: Born radii
nonpolarForce: nonpolar force (0 if nonpolar not included, else
ACE value)
bornForce1: i-unroll first force component, including dBornR/dr in .w
bornForce2: i-unroll second force component, including dBornR/dr in .w
bornForce3: i-unroll first force component, including dBornR/dr in .w
bornForce4: i-unroll second force component, including dBornR/dr in .w
--------------------------------------------------------------------------------------- */
//FLOPS = 544
kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float soluteDielectric,
float solventDielectric, float includeAce,
float4 posq[][], float bornRadii[][], float atomicRadii[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// Born radii
float i1BornR, i2BornR, i3BornR, i4BornR;
float j1BornR, j2BornR, j3BornR, j4BornR;
float4 jBornR;
// atomic radii
float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
// i,j coordinates
float3 i1Pos, i2Pos, i3Pos, i4Pos;
float3 j1Pos, j2Pos, j3Pos, j4Pos;
float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;
// i, j partial charges
float i1Q, i2Q, i3Q, i4Q;
float j1Q, j2Q, j3Q, j4Q;
float4 jQ;
float aceForce;
// delta coordinates
float3 d1, d2, d3, d4;
// intermediate terms
float4 dGpol_dr, dGpol_dalpha2_ij;
// indices
float2 iAtom;
float forceIndex;
// This is forceIndex mod numberOfAtoms, the true i index
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// electricConstant = -166.0f2691;
// preFactor = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric))
float preFactor = -332.05382f;
const float I_Unroll = 4.0f;
const float3 zero3 = float3( 0.0f, 0.0f, 0.0f );
// ---------------------------------------------------------------------------------------
preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather index
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );
// ---------------------------------------------------------------------------------------
// etch i1 position and partial charge
jQ = posq[ iAtom ];
i1BornR = bornRadii[ iAtom ];
i1AtomicR = atomicRadii[ iAtom ];
i1Pos = jQ.xyz;
i1Q = jQ.w;
i1Q *= preFactor;
kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
bornForce1.xyz = zero3;
bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// etch i2 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i2BornR = bornRadii[ iAtom ];
i2AtomicR = atomicRadii[ iAtom ];
i2Pos = jQ.xyz;
i2Q = jQ.w;
i2Q *= preFactor;
kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
bornForce2.xyz = zero3;
bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// etch i3 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i3BornR = bornRadii[ iAtom ];
i3AtomicR = atomicRadii[ iAtom ];
i3Pos = jQ.xyz;
i3Q = jQ.w;
i3Q *= preFactor;
kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
bornForce3.xyz = zero3;
bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// etch i4 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i4BornR = bornRadii[ iAtom ];
i4AtomicR = atomicRadii[ iAtom ];
i4Pos = jQ.xyz;
i4Q = jQ.w;
i4Q *= preFactor;
kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
bornForce4.xyz = zero3;
bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// inner loop setup
// if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
// to increase number of threads in flight
// forceStreamSz = N*RepFac/I_UnRoll
// forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz )
// whichRep = 0 or 1
// jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ]
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
//jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
tmp = fmod( numberOfAtoms, (duplicationFactor*streamWidth ) );
jBlock = 1 + round( (numberOfAtoms-tmp)/(duplicationFactor*streamWidth ));
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
// ---------------------------------------------------------------------------------------
// gather required values
j1PosQ = posq[ jAtom ];
j1Pos = j1PosQ.xyz;
j1Q = j1PosQ.w;
j1BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j2PosQ = posq[ jAtom ];
j2Pos = j2PosQ.xyz;
j2Q = j2PosQ.w;
j2BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j3PosQ = posq[ jAtom ];
j3Pos = j3PosQ.xyz;
j3Q = j3PosQ.w;
j3BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j4PosQ = posq[ jAtom ];
j4Pos = j4PosQ.xyz;
j4Q = j4PosQ.w;
j4BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR );
jQ = float4( j1Q, j2Q, j3Q, j4Q );
// ---------------------------------------------------------------------------------------
// i == 1
d1 = i1Pos - j1Pos; //3
d2 = i1Pos - j2Pos; //3
d3 = i1Pos - j3Pos; //3
d4 = i1Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce1.xyz += dGpol_dr.x*d1; //6
bornForce1.xyz += dGpol_dr.y*d2; //6
bornForce1.xyz += dGpol_dr.z*d3; //6
bornForce1.xyz += dGpol_dr.w*d4; //6
bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
// i == 2
d1 = i2Pos - j1Pos; //3
d2 = i2Pos - j2Pos; //3
d3 = i2Pos - j3Pos; //3
d4 = i2Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce2.xyz += dGpol_dr.x*d1; //6
bornForce2.xyz += dGpol_dr.y*d2; //6
bornForce2.xyz += dGpol_dr.z*d3; //6
bornForce2.xyz += dGpol_dr.w*d4; //6
bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
// i == 3
d1 = i3Pos - j1Pos; //3
d2 = i3Pos - j2Pos; //3
d3 = i3Pos - j3Pos; //3
d4 = i3Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce3.xyz += dGpol_dr.x*d1; //6
bornForce3.xyz += dGpol_dr.y*d2; //6
bornForce3.xyz += dGpol_dr.z*d3; //6
bornForce3.xyz += dGpol_dr.w*d4; //6
bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
// i == 4
d1 = i4Pos - j1Pos; //3
d2 = i4Pos - j2Pos; //3
d3 = i4Pos - j3Pos; //3
d4 = i4Pos - j4Pos; //3
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
bornForce4.xyz += dGpol_dr.x*d1; //6
bornForce4.xyz += dGpol_dr.y*d2; //6
bornForce4.xyz += dGpol_dr.z*d3; //6
bornForce4.xyz += dGpol_dr.w*d4; //6
bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
// ---------------------------------------------------------------------------------------
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
platforms/brook/src/gpu/kmerge.br
View file @
8c3878b2
...
@@ -225,154 +225,6 @@ kernel void kMergeFloat4_4(
...
@@ -225,154 +225,6 @@ kernel void kMergeFloat4_4(
}
}
}
}
kernel void kPostObcLoop1(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float obcChain<>,
float bornRadii<>,
out float4 outstream<>,
out float bornRadii2Force<> )
{
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float4 o1,o2,o3,o4;
float4 tmp;
float4 zero4;
// given atom index find force indices and streams
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
outstream = zero4;
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
// outstream += float4( forceIndex, qIndex, pindex.x, pindex.y );
// This is going to cause really divergent code and we are
// going to end up doing all the fetches anyway...
if ( qOff < 0.5f ){
outstream += pstream1[ pindex ];
} else if( qOff < 1.5f ){
outstream += pstream2[ pindex ];
} else if( qOff < 2.5f ){
outstream += pstream3[ pindex ];
} else {
outstream += pstream4[ pindex ];
}
// o1 = pstream1[ pindex ];
// o2 = pstream2[ pindex ];
// o3 = pstream3[ pindex ];
// o4 = pstream4[ pindex ];
// tmp = qOff < 0.5f ? o1 : o2;
// tmp = qOff < 1.5f ? tmp : o3;
// tmp = qOff < 2.5f ? tmp : o4;
// outstream += tmp;
forceIndex += roundNatoms;
}
bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w;
}
// The inner loop by definition creates divergent paths. Chances are
// fair that we will take all sides of the branch anyway, so this
// verion uses manual predication
kernel void kPostObcLoop1_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float obcChain<>,
float bornRadii<>,
out float4 outstream<>,
out float bornRadii2Force<> )
{
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float4 o1,o2,o3,o4;
float4 tmp;
// given atom index find force indices and streams
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
o1 = pstream1[ pindex ];
o2 = pstream2[ pindex ];
o3 = pstream3[ pindex ];
o4 = pstream4[ pindex ];
tmp = qOff < 0.5f ? o1 : o2;
tmp = qOff < 1.5f ? tmp : o3;
tmp = qOff < 2.5f ? tmp : o4;
outstream += tmp;
forceIndex += roundNatoms;
}
bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w;
}
kernel void kSetValue4( float value, out float4 outstream<> ){
kernel void kSetValue4( float value, out float4 outstream<> ){
outstream = float4( value, value, value, value );
outstream = float4( value, value, value, value );
}
}
...
@@ -522,354 +374,6 @@ kernel void kAddAndMergeFloat4_4(
...
@@ -522,354 +374,6 @@ kernel void kAddAndMergeFloat4_4(
}
}
}
}
kernel void kPostObcLoop2(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float conversion,
float mergeNonObcForces,
float4 inObcForces<>,
float3 nonObcForces<>,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float atomicRadii<>,
out float bornRadii<>,
out float obcChain<>,
out float3 outForces<> ){
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float2 iAtom;
float4 forces;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
// given atom index find force indices and streams
pindex = indexof( outForces );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// add current forces in inStream to forces stored in pstreams
// the .w entry is Born sum values; it will be used to calculate the
// Born radii and obcChain term
forces = inObcForces;
forces.w = 0.0f;
//forces = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// sum over j-loop 'duplications' by gathering from pstreams
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
if( qOff < 0.5f ){
forces += pstream1[ pindex ];
} else if( qOff < 1.5f ){
forces += pstream2[ pindex ];
} else if( qOff < 2.5f ){
forces += pstream3[ pindex ];
} else {
forces += pstream4[ pindex ];
}
forceIndex += roundNatoms;
}
// compute Born radii and ObcChain
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum = forces.w;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
if( atomIndex >= natoms ){
bornRadii = 0.0f;
obcChain = 0.0f;
}
// add converted new forces to non-Obc forces
outForces = conversion*forces.xyz;
if( mergeNonObcForces > 0.1f ){
outForces += nonObcForces;
}
}
kernel void kPostObcLoop2_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float conversion,
float mergeNonObcForces,
float4 inObcForces<>,
float3 nonObcForces<>,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
float atomicRadii<>,
out float bornRadii<>,
out float obcChain<>,
out float3 outForces<> ){
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float4 o1,o2,o3,o4;
float4 tmp;
float2 iAtom;
float4 forces;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
// given atom index find force indices and streams
pindex = indexof( outForces );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// add current forces in inStream to forces stored in pstreams
// the .w entry is Born sum values; it will be used to calculate the
// Born radii and obcChain term
forces = inObcForces;
forces.w = 0.0f;
//forces = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// sum over j-loop 'duplications' by gathering from pstreams
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = floor( forceIndex/iUnroll );
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
// pindex.y = floor( qIndex/ pStreamWidth );
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
// pindex.x = qIndex - pindex.y*pStreamWidth + qOff;
pindex.x = qIndex - pindex.y*pStreamWidth;
o1 = pstream1[ pindex ];
o2 = pstream2[ pindex ];
o3 = pstream3[ pindex ];
o4 = pstream4[ pindex ];
tmp = qOff < 0.5f ? o1 : o2;
tmp = qOff < 1.5f ? tmp : o3;
tmp = qOff < 2.5f ? tmp : o4;
forces += tmp;
forceIndex += roundNatoms;
}
// compute Born radii and ObcChain
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum = forces.w;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
if( atomIndex >= natoms ){
bornRadii = 0.0f;
obcChain = 0.0f;
}
// add converted new forces to non-Obc forces
outForces = conversion*forces.xyz;
if( mergeNonObcForces > 0.1f ){
outForces += nonObcForces;
}
}
kernel void kPostCalculateBornRadii_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float conversion,
float mergeNonObcForces,
float4 pstream1[][],
float atomicRadii<>,
out float bornRadii<>,
out float obcChain<> ){
// ---------------------------------------------------------------------------------------
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float4 o1;
float tmp;
float2 iAtom;
float4 forces;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
// given atom index find force indices and streams
pindex = indexof( obcChain );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// add current forces in inStream to forces stored in pstreams
// the .w entry is Born sum values; it will be used to calculate the
// Born radii and obcChain term
bornSum = 0.0f;
// sum over j-loop 'duplications' by gathering from pstreams
for( i = 0.0f; i < repfac; i += 1.0f ){
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
qOff = forceIndex - iUnroll*qIndex;
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
pindex.x = qIndex - pindex.y*pStreamWidth;
o1 = pstream1[ pindex ];
tmp = qOff < 0.5f ? o1.x : o1.y;
tmp = qOff < 1.5f ? tmp : o1.z;
tmp = qOff < 2.5f ? tmp : o1.w;
bornSum += tmp;
forceIndex += roundNatoms;
}
// compute Born radii and ObcChain
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
if( atomIndex >= natoms ){
bornRadii = 0.0f;
obcChain = 0.0f;
}
}
/* Add forces from two streams */
/* Add forces from two streams */
kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment