/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2008-2009 Stanford University and the Authors. * * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns, Mike Houston * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ //124 FLOPS [??? total] kernel void bornSumInternal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float iAtomicRadii, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); //4 rInverse = rsqrt( r2 ); //8 rPlusScaledJ = r + jAtomicRadiiScaled; //4 rMinusScaledJ = r - jAtomicRadiiScaled; //4 rMinusScaledJAbs = abs( rMinusScaledJ ); //40 // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; //4 l_ij2 = l_ij*l_ij; //4 u_ij = 1.0f/rPlusScaledJ; //4 u_ij2 = u_ij*u_ij; //4 diff_u2_l2 = u_ij2 - l_ij2; //4 logRatio = log(u_ij/l_ij); //4 //24 // --------------------------------------------------------------------------------------- // terms associated w/ dL/dr & dU/dr are zero and not included // Born radius term bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; //10*4 t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; //12 for true, but don't know how to average so saying 0 bornSum += t3; //4 bornSum *= mask; //4 //60 } //???? FLOPS kernel void kCalculateBornRadii( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float3 posq[][], float2 scaledAtomicRadii[][], out float4 bornForce<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; //This is forceIndex mod numberOfAtoms, the true i index float4 de, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; float tmp2; // --------------------------------------------------------------------------------------- // constants const float I_Unroll = 4.0f; // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // fetch position, (atomic radius - dielectric offset ) i1pos = posq[ iAtom ]; i1AtomicRadii = scaledAtomicRadii[ iAtom ].y; iAtom.x += 1; i2pos = posq[ iAtom ]; i2AtomicRadii = scaledAtomicRadii[ iAtom ].y; iAtom.x += 1; i3pos = posq[ iAtom ]; i3AtomicRadii = scaledAtomicRadii[ iAtom ].y; iAtom.x += 1; i4pos = posq[ iAtom ]; i4AtomicRadii = scaledAtomicRadii[ iAtom ].y; bornForce = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // inner loop indices setup //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); jBlock = 1 + round( (numberOfAtoms-fmod( numberOfAtoms, (duplicationFactor*streamWidth) )) / (duplicationFactor*streamWidth) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j1pos = posq[ jAtom ]; jAtom.x += 1.0f; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j2pos = posq[ jAtom ]; jAtom.x += 1.0f; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j3pos = posq[ jAtom ]; jAtom.x += 1.0f; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j4pos = posq[ jAtom ]; jAtom.x += 1.0f; jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 d1 = j1pos - i1pos; //3 d2 = j2pos - i1pos; //3 d3 = j3pos - i1pos; //3 d4 = j4pos - i1pos; //3 // i = 1 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i1AtomicRadii, bornSum ); //112 : 368(?) bornForce.x += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 2 d1 = j1pos - i2pos; //3 d2 = j2pos - i2pos; //3 d3 = j3pos - i2pos; //3 d4 = j4pos - i2pos; //3 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i2AtomicRadii, bornSum );//112 : 368(?) bornForce.y += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 3 d1 = j1pos - i3pos; //3 d2 = j2pos - i3pos; //3 d3 = j3pos - i3pos; //3 d4 = j4pos - i3pos; //3 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i3AtomicRadii, bornSum );//112 : 368(?) bornForce.z += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 4 d1 = j1pos - i4pos; //3 d2 = j2pos - i4pos; //3 d3 = j3pos - i4pos; //3 d4 = j4pos - i4pos; //3 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i4AtomicRadii, bornSum );//112 : 368(?) bornForce.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } } kernel void kPostCalculateBornRadii_nobranch( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float conversion, float mergeNonObcForces, float4 pstream1[][], float atomicRadii<>, out float bornRadii<>, out float obcChain<> ){ // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float4 o1; float tmp; float2 iAtom; float4 forces; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- // given atom index find force indices and streams pindex = indexof( obcChain ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; // add current forces in inStream to forces stored in pstreams // the .w entry is Born sum values; it will be used to calculate the // Born radii and obcChain term bornSum = 0.0f; // sum over j-loop 'duplications' by gathering from pstreams for( i = 0.0f; i < repfac; i += 1.0f ){ qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); pindex.x = qIndex - pindex.y*pStreamWidth; o1 = pstream1[ pindex ]; tmp = qOff < 0.5f ? o1.x : o1.y; tmp = qOff < 1.5f ? tmp : o1.z; tmp = qOff < 2.5f ? tmp : o1.w; bornSum += tmp; forceIndex += roundNatoms; } // compute Born radii and ObcChain atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; if( atomIndex >= natoms ){ bornRadii = 0.0f; obcChain = 0.0f; } } kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR, float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>, out float4 dGpol_dalpha2_ij<> ){ // --------------------------------------------------------------------------------------- float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol; // --------------------------------------------------------------------------------------- r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); alpha2_ij = jBornR*iBornR; D_ij = r2/(4.0f*alpha2_ij); expTerm = exp( -D_ij ); denominator2 = r2 + alpha2_ij*expTerm; denominator = sqrt( denominator2 ); Gpol = jQ/denominator; Gpol *= iQ; dGpol_dr = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2; dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; } /* --------------------------------------------------------------------------------------- Calculate nonpolar ACE term (Simbios) bornRadius: Born radius vdwRadius: Vdw radius duplicationFactor: duplication factor aceForce: ACE term --------------------------------------------------------------------------------------- */ kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor, out float aceForce<> ){ // --------------------------------------------------------------------------------------- // nonpolar term float iSurface; float iAceTerm; // --------------------------------------------------------------------------------------- // constants // solvent radius const float probeRadius = 0.14f; // PI*4*6*0.0054*1000 (0.0054=asolv from Tinker) //const float PI_24_aI = -0.3694512961; const float PI_24_aI = -407.1504079f; // --------------------------------------------------------------------------------------- // etch i position and partial charge // e = ai * term * (ri+probe)**2 * (ri/rb)**6 // (drbi) = drb(i) - 6.0fd0*e/rb // (rI+probe)**2 iSurface = (iVdwRadius+probeRadius); iSurface = iSurface*iSurface; // (rI/rB)**6 iAceTerm = iVdwRadius/iBornRadius; iAceTerm = iAceTerm*iAceTerm*iAceTerm; iAceTerm = iAceTerm*iAceTerm; aceForce = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius); } /* --------------------------------------------------------------------------------------- Calculate first loop force terms (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms -- accounts for unrolling duplicationFactor: number of threads for inner loop streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) soluteDielectric: solute dielectric solventDielectric: solvent dielectric includeAce: include ACE term posq: atom positions and charge bornRadii: Born radii nonpolarForce: nonpolar force (0 if nonpolar not included, else ACE value) bornForce1: i-unroll first force component, including dBornR/dr in .w bornForce2: i-unroll second force component, including dBornR/dr in .w bornForce3: i-unroll first force component, including dBornR/dr in .w bornForce4: i-unroll second force component, including dBornR/dr in .w --------------------------------------------------------------------------------------- */ kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float soluteDielectric, float solventDielectric, float includeAce, float3 posq[][], float bornRadii[][], float2 atomicRadii[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // Born radii float i1BornR, i2BornR, i3BornR, i4BornR; float j1BornR, j2BornR, j3BornR, j4BornR; float4 jBornR; // atomic radii float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR; // i,j coordinates float3 i1Pos, i2Pos, i3Pos, i4Pos; float3 j1Pos, j2Pos, j3Pos, j4Pos; float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ; // i, j partial charges float i1Q, i2Q, i3Q, i4Q; float j1Q, j2Q, j3Q, j4Q; float4 jQ; float aceForce; // delta coordinates float3 d1, d2, d3, d4; // intermediate terms float4 dGpol_dr, dGpol_dalpha2_ij; // indices float2 iAtom; float forceIndex; // This is forceIndex mod numberOfAtoms, the true i index float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; // --------------------------------------------------------------------------------------- // electricConstant = -166.0f2691; // preFactor = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric)) float preFactor = -332.05382f; const float I_Unroll = 4.0f; const float3 zero3 = float3( 0.0f, 0.0f, 0.0f ); // --------------------------------------------------------------------------------------- preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) ); iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather index iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth ); // --------------------------------------------------------------------------------------- // etch i1 position and partial charge jQ = posq[ iAtom ]; i1Pos = jQ.xyz; i1Q = atomicRadii[ iAtom ].y; i1Q *= preFactor; i1BornR = bornRadii[ iAtom ]; i1AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce ); bornForce1.xyz = zero3; bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i2 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i2Pos = jQ.xyz; i2Q = atomicRadii[ iAtom ].y; i2Q *= preFactor; i2BornR = bornRadii[ iAtom ]; i2AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce ); bornForce2.xyz = zero3; bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i3 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i3Pos = jQ.xyz; i3Q = atomicRadii[ iAtom ].y; i3Q *= preFactor; i3BornR = bornRadii[ iAtom ]; i3AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce ); bornForce3.xyz = zero3; bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i4 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i4Pos = jQ.xyz; i4Q = atomicRadii[ iAtom ].y; i4Q *= preFactor; i4BornR = bornRadii[ iAtom ]; i4AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce ); bornForce4.xyz = zero3; bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // inner loop setup // if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments // to increase number of threads in flight // forceStreamSz = N*RepFac/I_UnRoll // forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz ) // whichRep = 0 or 1 // jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ] //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) { // --------------------------------------------------------------------------------------- // gather required values j1Pos = posq[ jAtom ]; j1Q = atomicRadii[ jAtom ].y; j1BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j2Pos = posq[ jAtom ]; j2Q = atomicRadii[ jAtom ].y; j2BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j3Pos = posq[ jAtom ]; j3Q = atomicRadii[ jAtom ].y; j3BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j4Pos = posq[ jAtom ]; j4Q = atomicRadii[ jAtom ].y; j4BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR ); jQ = float4( j1Q, j2Q, j3Q, j4Q ); // --------------------------------------------------------------------------------------- // i == 1 d1 = i1Pos - j1Pos; d2 = i1Pos - j2Pos; d3 = i1Pos - j3Pos; d4 = i1Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce1.xyz += dGpol_dr.x*d1; bornForce1.xyz += dGpol_dr.y*d2; bornForce1.xyz += dGpol_dr.z*d3; bornForce1.xyz += dGpol_dr.w*d4; bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- // i == 2 d1 = i2Pos - j1Pos; d2 = i2Pos - j2Pos; d3 = i2Pos - j3Pos; d4 = i2Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce2.xyz += dGpol_dr.x*d1; bornForce2.xyz += dGpol_dr.y*d2; bornForce2.xyz += dGpol_dr.z*d3; bornForce2.xyz += dGpol_dr.w*d4; bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- // i == 3 d1 = i3Pos - j1Pos; d2 = i3Pos - j2Pos; d3 = i3Pos - j3Pos; d4 = i3Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce3.xyz += dGpol_dr.x*d1; bornForce3.xyz += dGpol_dr.y*d2; bornForce3.xyz += dGpol_dr.z*d3; bornForce3.xyz += dGpol_dr.w*d4; bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- // i == 4 d1 = i4Pos - j1Pos; d2 = i4Pos - j2Pos; d3 = i4Pos - j3Pos; d4 = i4Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce4.xyz += dGpol_dr.x*d1; bornForce4.xyz += dGpol_dr.y*d2; bornForce4.xyz += dGpol_dr.z*d3; bornForce4.xyz += dGpol_dr.w*d4; bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- jLinind += 4.0f; } jAtom.y += 1.0f; } } // The inner loop by definition creates divergent paths. Chances are // fair that we will take all sides of the branch anyway, so this // verion uses manual predication kernel void kPostObcLoop1_nobranch( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], float obcChain<>, float bornRadii<>, out float4 outstream<>, out float bornRadii2Force<> ) { // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float4 o1,o2,o3,o4; float4 tmp; // given atom index find force indices and streams pindex = indexof( outstream ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f ); for( i = 0.0f; i < repfac; i += 1.0f ){ qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); pindex.x = qIndex - pindex.y*pStreamWidth; o1 = pstream1[ pindex ]; o2 = pstream2[ pindex ]; o3 = pstream3[ pindex ]; o4 = pstream4[ pindex ]; tmp = qOff < 0.5f ? o1 : o2; tmp = qOff < 1.5f ? tmp : o3; tmp = qOff < 2.5f ? tmp : o4; outstream += tmp; forceIndex += roundNatoms; } bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w; } //156 FLOPS [172 total] kernel void loop2Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); //4 rInverse = rsqrt( r2 ); //8 r2Inverse = rInverse*rInverse; //4 rPlusScaledJ = r + jAtomicRadiiScaled; //4 rMinusScaledJ = r - jAtomicRadiiScaled; //4 rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; //4 l_ij2 = l_ij*l_ij; //4 u_ij = 1.0f/rPlusScaledJ; //4 u_ij2 = u_ij*u_ij; //4 diff_u2_l2 = u_ij2 - l_ij2; //4 scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; //4 logRatio = log(u_ij/l_ij); //4 // terms associated w/ dL/dr & dU/dr are zero and not included t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; //7*4 de = mask*bornForce*t3*rInverse; //12 // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = scaledRadiusJ2; // de = mask*t3; // de = mask*rPlusScaledJ; // de = mask*r2; // de = mask; // de = mask*rInverse; // de = mask*bornForce; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; /* bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; //10*4 t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; //12 for true, but don't know how to average so saying 0 bornSum += t3; //4 bornSum *= mask; //4 */ } kernel void loop2InternalR2( float4 r2, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; diff_u2_l2 = u_ij2 - l_ij2; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated w/ dL/dr & dU/dr are zero and not included t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; de = mask*bornForce*t3*rInverse; // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = scaledRadiusJ2; // de = mask*t3; // de = mask*rPlusScaledJ; // de = mask*r2; // de = mask; // de = mask*rInverse; // de = mask*bornForce; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; bornSum += t3; bornSum *= mask; } /* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */ //112 FLOPS kernel void loop2InternalNoSum( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); //4 rInverse = rsqrt( r2 ); //8 r2Inverse = rInverse*rInverse; //4 rPlusScaledJ = r + jAtomicRadiiScaled; //4 rMinusScaledJ = r - jAtomicRadiiScaled; //4 rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- // mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; //4 l_ij2 = l_ij*l_ij; //4 u_ij = 1.0f/rPlusScaledJ; //4 u_ij2 = u_ij*u_ij; //4 scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; //4 logRatio = log(u_ij/l_ij); //4 // terms associated w/ dL/dr & dU/dr are zero and not included t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse; //8*4 de = mask*bornForce*t3*rInverse; //12 } /* --------------------------------------------------------------------------------------- Calculate Loop 2 OBC forces (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms based on unroll duplicationFactor: ? strheight: atom stream height streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) posq: atom positions and charge atomicRadii: atomic radii scaledAtomicRadii: scaled atomic radii bornForceFactor: (Born radii)**2*bornForce*Obcforce bornForce1: i-unroll first force component, including dBorn_r/dr in .w bornForce2: i-unroll second force component, including dBorn_r/dr in .w bornForce3: i-unroll third force component, including dBorn_r/dr in .w bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w #endif --------------------------------------------------------------------------------------- */ //1376 FLOPS kernel void kObcLoop2( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float3 posq[][], float2 scaledAtomicRadii[][], float bornForceFactor[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii; float4 iAtomicRadii, jAtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii; float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 iScaledAtomicRadii, jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor; float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor; //float4 iBornForceFactor; //float4 jBornForceFactor; // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; //This is forceIndex mod numberOfAtoms, the true i index float4 de, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; float tmp2; // --------------------------------------------------------------------------------------- // constants const float I_Unroll = 4.0f; // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // etch i1 position, Born prefactor, atomic radius i1pos = posq[ iAtom ]; i1BornForceFactor = bornForceFactor[ iAtom ]; i1AtomicRadii = scaledAtomicRadii[ iAtom ].y; i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // etch i2 position, Born prefactor, atomic radius iAtom.x += 1; i2pos = posq[ iAtom ]; i2BornForceFactor = bornForceFactor[ iAtom ]; i2AtomicRadii = scaledAtomicRadii[ iAtom ].y; i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // etch i3 position, Born prefactor, atomic radius iAtom.x += 1; i3pos = posq[ iAtom ]; i3BornForceFactor = bornForceFactor[ iAtom ]; i3AtomicRadii = scaledAtomicRadii[ iAtom ].y; i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // etch i4 position, Born prefactor, atomic radius iAtom.x += 1; i4pos = posq[ iAtom ]; i4BornForceFactor = bornForceFactor[ iAtom ]; i4AtomicRadii = scaledAtomicRadii[ iAtom ].y; i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // create float4 for main vars iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii ); iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii ); // inner loop indices setup //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); jBlock = 1 + round( (numberOfAtoms-fmod( numberOfAtoms, (duplicationFactor*streamWidth) )) / (duplicationFactor*streamWidth) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values j1BornForceFactor = bornForceFactor[ jAtom ]; j1AtomicRadii = scaledAtomicRadii[ jAtom ].y; j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j1pos = posq[ jAtom ]; jAtom.x += 1.0f; j2BornForceFactor = bornForceFactor[ jAtom ]; j2AtomicRadii = scaledAtomicRadii[ jAtom ].y; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j2pos = posq[ jAtom ]; jAtom.x += 1.0f; j3BornForceFactor = bornForceFactor[ jAtom ]; j3AtomicRadii = scaledAtomicRadii[ jAtom ].y; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j3pos = posq[ jAtom ]; jAtom.x += 1.0f; j4BornForceFactor = bornForceFactor[ jAtom ]; j4AtomicRadii = scaledAtomicRadii[ jAtom ].y; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j4pos = posq[ jAtom ]; jAtom.x += 1.0f; jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii ); jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 d1 = j1pos - i1pos; //3 d2 = j2pos - i1pos; //3 d3 = j3pos - i1pos; //3 d4 = j4pos - i1pos; //3 // i = 1 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de ); //156 : 368 bornForce1.xyz += de.x*d1; //3*2=6 bornForce1.xyz += de.y*d2; //3*2=6 bornForce1.xyz += de.z*d3; //3*2=6 bornForce1.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // i = 2 d1 = j1pos - i2pos; //3 d2 = j2pos - i2pos; //3 d3 = j3pos - i2pos; //3 d4 = j4pos - i2pos; //3 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de );//156 : 368 bornForce2.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce2.xyz += de.z*d3; //3*2=6 bornForce2.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // i = 3 d1 = j1pos - i3pos; //3 d2 = j2pos - i3pos; //3 d3 = j3pos - i3pos; //3 d4 = j4pos - i3pos; //3 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de );//156 : 368 bornForce3.xyz += de.x*d1; //3*2=6 bornForce3.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce3.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // i = 4 d1 = j1pos - i4pos; //3 d2 = j2pos - i4pos; //3 d3 = j3pos - i4pos; //3 d4 = j4pos - i4pos; //3 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de );//156 : 366 bornForce4.xyz += de.x*d1; //3*2=6 bornForce4.xyz += de.y*d2; //3*2=6 bornForce4.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 1 d1 = j1pos - i1pos; //3 d2 = j1pos - i2pos; //3 d3 = j1pos - i3pos; //3 d4 = j1pos - i4pos; //3 loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 2 d1 = j2pos - i1pos; //3 d2 = j2pos - i2pos; //3 d3 = j2pos - i3pos; //3 d4 = j2pos - i4pos; //3 loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 3 d1 = j3pos - i1pos; //3 d2 = j3pos - i2pos; //3 d3 = j3pos - i3pos; //3 d4 = j3pos - i4pos; //3 loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 4 d1 = j4pos - i1pos; //3 d2 = j4pos - i2pos; //3 d3 = j4pos - i3pos; //3 d4 = j4pos - i4pos; //3 loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } } kernel void kPostObcLoop2_nobranch( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float conversion, float mergeNonObcForces, float4 inObcForces<>, float3 nonObcForces<>, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], float atomicRadii<>, out float bornRadii<>, out float obcChain<>, out float3 outForces<> ){ // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float4 o1,o2,o3,o4; float4 tmp; float2 iAtom; float4 forces; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- // given atom index find force indices and streams pindex = indexof( outForces ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; // add current forces in inStream to forces stored in pstreams // the .w entry is Born sum values; it will be used to calculate the // Born radii and obcChain term forces = inObcForces; forces.w = 0.0f; //forces = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // sum over j-loop 'duplications' by gathering from pstreams for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; o1 = pstream1[ pindex ]; o2 = pstream2[ pindex ]; o3 = pstream3[ pindex ]; o4 = pstream4[ pindex ]; tmp = qOff < 0.5f ? o1 : o2; tmp = qOff < 1.5f ? tmp : o3; tmp = qOff < 2.5f ? tmp : o4; forces += tmp; forceIndex += roundNatoms; } // compute Born radii and ObcChain atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum = forces.w; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; if( atomIndex >= natoms ){ bornRadii = 0.0f; obcChain = 0.0f; } // add converted new forces to non-Obc forces outForces = conversion*forces.xyz; if( mergeNonObcForces > 0.1f ){ outForces += nonObcForces; } }