/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2008 Stanford University and the Authors. * * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR, float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>, out float4 dGpol_dalpha2_ij<> ){ // --------------------------------------------------------------------------------------- float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol; // --------------------------------------------------------------------------------------- r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); alpha2_ij = jBornR*iBornR; D_ij = r2/(4.0f*alpha2_ij); expTerm = exp( -D_ij ); denominator2 = r2 + alpha2_ij*expTerm; denominator = sqrt( denominator2 ); Gpol = jQ/denominator; Gpol *= iQ; dGpol_dr = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2; dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; } /* --------------------------------------------------------------------------------------- Calculate nonpolar ACE term (Simbios) bornRadius: Born radius vdwRadius: Vdw radius duplicationFactor: duplication factor aceForce: ACE term --------------------------------------------------------------------------------------- */ kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor, out float aceForce<> ){ // --------------------------------------------------------------------------------------- // nonpolar term float iSurface; float iAceTerm; // --------------------------------------------------------------------------------------- // constants // solvent radius const float probeRadius = 0.14f; // PI*4*6*0.0054*1000 (0.0054=asolv from Tinker) //const float PI_24_aI = -0.3694512961; const float PI_24_aI = -407.1504079f; // --------------------------------------------------------------------------------------- // etch i position and partial charge // e = ai * term * (ri+probe)**2 * (ri/rb)**6 // (drbi) = drb(i) - 6.0fd0*e/rb // (rI+probe)**2 iSurface = (iVdwRadius+probeRadius); iSurface = iSurface*iSurface; // (rI/rB)**6 iAceTerm = iVdwRadius/iBornRadius; iAceTerm = iAceTerm*iAceTerm*iAceTerm; iAceTerm = iAceTerm*iAceTerm; aceForce = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius); } /* --------------------------------------------------------------------------------------- Calculate first loop force terms (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms -- accounts for unrolling duplicationFactor: number of threads for inner loop streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) soluteDielectric: solute dielectric solventDielectric: solvent dielectric includeAce: include ACE term posq: atom positions and charge bornRadii: Born radii nonpolarForce: nonpolar force (0 if nonpolar not included, else ACE value) bornForce1: i-unroll first force component, including dBornR/dr in .w bornForce2: i-unroll second force component, including dBornR/dr in .w bornForce3: i-unroll first force component, including dBornR/dr in .w bornForce4: i-unroll second force component, including dBornR/dr in .w --------------------------------------------------------------------------------------- */ kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float soluteDielectric, float solventDielectric, float includeAce, float3 posq[][], float bornRadii[][], float2 atomicRadii[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // Born radii float i1BornR, i2BornR, i3BornR, i4BornR; float j1BornR, j2BornR, j3BornR, j4BornR; float4 jBornR; // atomic radii float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR; // i,j coordinates float3 i1Pos, i2Pos, i3Pos, i4Pos; float3 j1Pos, j2Pos, j3Pos, j4Pos; float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ; // i, j partial charges float i1Q, i2Q, i3Q, i4Q; float j1Q, j2Q, j3Q, j4Q; float4 jQ; float aceForce; // delta coordinates float3 d1, d2, d3, d4; // intermediate terms float4 dGpol_dr, dGpol_dalpha2_ij; // indices float2 iAtom; float forceIndex; // This is forceIndex mod numberOfAtoms, the true i index float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; // --------------------------------------------------------------------------------------- // electricConstant = -166.0f2691; // preFactor = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric)) float preFactor = -332.05382f; const float I_Unroll = 4.0f; const float3 zero3 = float3( 0.0f, 0.0f, 0.0f ); // --------------------------------------------------------------------------------------- preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) ); iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather index iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth ); // --------------------------------------------------------------------------------------- // etch i1 position and partial charge jQ = posq[ iAtom ]; i1Pos = jQ.xyz; i1Q = atomicRadii[ iAtom ].y; i1Q *= preFactor; i1BornR = bornRadii[ iAtom ]; i1AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce ); bornForce1.xyz = zero3; bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i2 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i2Pos = jQ.xyz; i2Q = atomicRadii[ iAtom ].y; i2Q *= preFactor; i2BornR = bornRadii[ iAtom ]; i2AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce ); bornForce2.xyz = zero3; bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i3 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i3Pos = jQ.xyz; i3Q = atomicRadii[ iAtom ].y; i3Q *= preFactor; i3BornR = bornRadii[ iAtom ]; i3AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce ); bornForce3.xyz = zero3; bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i4 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i4Pos = jQ.xyz; i4Q = atomicRadii[ iAtom ].y; i4Q *= preFactor; i4BornR = bornRadii[ iAtom ]; i4AtomicR = atomicRadii[ iAtom ].x; kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce ); bornForce4.xyz = zero3; bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // inner loop setup // if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments // to increase number of threads in flight // forceStreamSz = N*RepFac/I_UnRoll // forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz ) // whichRep = 0 or 1 // jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ] //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) { // --------------------------------------------------------------------------------------- // gather required values j1Pos = posq[ jAtom ]; j1Q = atomicRadii[ jAtom ].y; j1BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j2Pos = posq[ jAtom ]; j2Q = atomicRadii[ jAtom ].y; j2BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j3Pos = posq[ jAtom ]; j3Q = atomicRadii[ jAtom ].y; j3BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j4Pos = posq[ jAtom ]; j4Q = atomicRadii[ jAtom ].y; j4BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR ); jQ = float4( j1Q, j2Q, j3Q, j4Q ); // --------------------------------------------------------------------------------------- // i == 1 d1 = i1Pos - j1Pos; d2 = i1Pos - j2Pos; d3 = i1Pos - j3Pos; d4 = i1Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce1.xyz += dGpol_dr.x*d1; bornForce1.xyz += dGpol_dr.y*d2; bornForce1.xyz += dGpol_dr.z*d3; bornForce1.xyz += dGpol_dr.w*d4; bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- // i == 2 d1 = i2Pos - j1Pos; d2 = i2Pos - j2Pos; d3 = i2Pos - j3Pos; d4 = i2Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce2.xyz += dGpol_dr.x*d1; bornForce2.xyz += dGpol_dr.y*d2; bornForce2.xyz += dGpol_dr.z*d3; bornForce2.xyz += dGpol_dr.w*d4; bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- // i == 3 d1 = i3Pos - j1Pos; d2 = i3Pos - j2Pos; d3 = i3Pos - j3Pos; d4 = i3Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce3.xyz += dGpol_dr.x*d1; bornForce3.xyz += dGpol_dr.y*d2; bornForce3.xyz += dGpol_dr.z*d3; bornForce3.xyz += dGpol_dr.w*d4; bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- // i == 4 d1 = i4Pos - j1Pos; d2 = i4Pos - j2Pos; d3 = i4Pos - j3Pos; d4 = i4Pos - j4Pos; loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); bornForce4.xyz += dGpol_dr.x*d1; bornForce4.xyz += dGpol_dr.y*d2; bornForce4.xyz += dGpol_dr.z*d3; bornForce4.xyz += dGpol_dr.w*d4; bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; // --------------------------------------------------------------------------------------- jLinind += 4.0f; } jAtom.y += 1.0f; } }