/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2008 Stanford University and the Authors. * * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ kernel void loop2Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; diff_u2_l2 = u_ij2 - l_ij2; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated w/ dL/dr & dU/dr are zero and not included t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; de = mask*bornForce*t3*rInverse; // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = scaledRadiusJ2; // de = mask*t3; // de = mask*rPlusScaledJ; // de = mask*r2; // de = mask; // de = mask*rInverse; // de = mask*bornForce; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; bornSum += t3; bornSum *= mask; } /**************************************************************** * This file is part of the gpu acceleration library for gromacs. * Author: Mark Friedrichs * * This kernel was developed in collaboration with * * Copyright (C) Pande Group, Stanford, 2006 *****************************************************************/ kernel void loop2InternalR2( float4 r2, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; diff_u2_l2 = u_ij2 - l_ij2; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated w/ dL/dr & dU/dr are zero and not included t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; de = mask*bornForce*t3*rInverse; // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = scaledRadiusJ2; // de = mask*t3; // de = mask*rPlusScaledJ; // de = mask*r2; // de = mask; // de = mask*rInverse; // de = mask*bornForce; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; bornSum += t3; bornSum *= mask; } /* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */ kernel void loop2InternalNoSum( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- // mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated w/ dL/dr & dU/dr are zero and not included t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse; de = mask*bornForce*t3*rInverse; // de = rInverse; // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = bornForce; } // This kernel includes terms dL/dr & dU/dr from paper // these values are zero, but included here to confirm that is the case kernel void loop2InternalX( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2, l_ij3; float4 u_ij, u_ij2, u_ij3; float4 t2,t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask, t2Mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; diff_u2_l2 = u_ij2 - l_ij2; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated (t2) w/ dL/dr & dU/dr are zero; included here to test that true // loop2Internal() should be used t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; l_ij3 = l_ij2*l_ij; // t1 in Tinker code t2 = 0.5f*l_ij2 + 0.25f*scaledRadiusJ2*l_ij3/r - 0.25f*(l_ij/r + l_ij3*r); t2Mask = iAtomicRadii4 < rMinusScaledJ ? t2 : zero4; t3 += t2Mask; // t2 in Tinker code u_ij3 = u_ij2*u_ij; t2 = -0.5f*u_ij2 - 0.25f*scaledRadiusJ2*u_ij3/r + 0.25f*(u_ij/r + u_ij3*r); t3 += t2; de = mask*bornForce*t3*rInverse; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; bornSum += t3; bornSum *= mask; } /* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */ kernel void loop2InternalNoSumX( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2, l_ij3; float4 u_ij, u_ij2, u_ij3; float4 t2, t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask, t2Mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- // mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated w/ dL/dr & dU/dr are zero and not included t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse; l_ij3 = l_ij2*l_ij; // t1 in Tinker code t2 = 0.5f*l_ij2 + 0.25f*scaledRadiusJ2*l_ij3/r - 0.25f*(l_ij/r + l_ij3*r); t2Mask = iAtomicRadii4 < rMinusScaledJ ? t2 : zero4; t3 += t2Mask; // t2 in Tinker code u_ij3 = u_ij2*u_ij; t2 = -0.5f*u_ij2 - 0.25f*scaledRadiusJ2*u_ij3/r + 0.25f*(u_ij/r + u_ij3*r); t3 += t2; de = mask*bornForce*t3*rInverse; } /* --------------------------------------------------------------------------------------- Calculate Loop 2 OBC forces (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms based on unroll duplicationFactor: ? strheight: atom stream height streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) posq: atom positions and charge atomicRadii: atomic radii scaledAtomicRadii: scaled atomic radii bornForceFactor: (Born radii)**2*bornForce*Obcforce bornForce1: i-unroll first force component, including dBorn_r/dr in .w bornForce2: i-unroll second force component, including dBorn_r/dr in .w bornForce3: i-unroll third force component, including dBorn_r/dr in .w bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w #endif --------------------------------------------------------------------------------------- */ kernel void kObcLoop2( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float3 posq[][], float2 scaledAtomicRadii[][], float bornForceFactor[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii; float4 iAtomicRadii, jAtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii; float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 iScaledAtomicRadii, jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor; float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor; //float4 iBornForceFactor; //float4 jBornForceFactor; // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; //This is forceIndex mod numberOfAtoms, the true i index float4 de, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; // --------------------------------------------------------------------------------------- // constants const float I_Unroll = 4.0f; // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // etch i1 position, Born prefactor, atomic radius i1pos = posq[ iAtom ]; i1BornForceFactor = bornForceFactor[ iAtom ]; i1AtomicRadii = scaledAtomicRadii[ iAtom ].y; i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); //bornForce1.xyz = float3( i1BornForceFactor, i1ScaledAtomicRadii, i1AtomicRadii ); // etch i2 position, Born prefactor, atomic radius iAtom.x += 1; i2pos = posq[ iAtom ]; i2BornForceFactor = bornForceFactor[ iAtom ]; i2AtomicRadii = scaledAtomicRadii[ iAtom ].y; i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); //bornForce2.xyz = float3( i2BornForceFactor, i2ScaledAtomicRadii, i2AtomicRadii ); // etch i3 position, Born prefactor, atomic radius iAtom.x += 1; i3pos = posq[ iAtom ]; i3BornForceFactor = bornForceFactor[ iAtom ]; i3AtomicRadii = scaledAtomicRadii[ iAtom ].y; i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); //bornForce3.xyz = float3( i3BornForceFactor, i3ScaledAtomicRadii, i3AtomicRadii ); // etch i4 position, Born prefactor, atomic radius iAtom.x += 1; i4pos = posq[ iAtom ]; i4BornForceFactor = bornForceFactor[ iAtom ]; i4AtomicRadii = scaledAtomicRadii[ iAtom ].y; i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x; bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); //bornForce4.xyz = float3( i4BornForceFactor, i4ScaledAtomicRadii, i4AtomicRadii ); // create float4 for main vars //iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor ); iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii ); iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii ); // inner loop indices setup //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values j1BornForceFactor = bornForceFactor[ jAtom ]; j1AtomicRadii = scaledAtomicRadii[ jAtom ].y; j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j1pos = posq[ jAtom ]; jAtom.x += 1.0f; j2BornForceFactor = bornForceFactor[ jAtom ]; j2AtomicRadii = scaledAtomicRadii[ jAtom ].y; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j2pos = posq[ jAtom ]; jAtom.x += 1.0f; j3BornForceFactor = bornForceFactor[ jAtom ]; j3AtomicRadii = scaledAtomicRadii[ jAtom ].y; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j3pos = posq[ jAtom ]; jAtom.x += 1.0f; j4BornForceFactor = bornForceFactor[ jAtom ]; j4AtomicRadii = scaledAtomicRadii[ jAtom ].y; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j4pos = posq[ jAtom ]; jAtom.x += 1.0f; //jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor ); jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii ); jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 d1 = j1pos - i1pos; d2 = j2pos - i1pos; d3 = j3pos - i1pos; d4 = j4pos - i1pos; // i = 1 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum ); bornForce1.xyz += de.x*d1; bornForce1.xyz += de.y*d2; bornForce1.xyz += de.z*d3; bornForce1.xyz += de.w*d4; bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // i = 2 d1 = j1pos - i2pos; d2 = j2pos - i2pos; d3 = j3pos - i2pos; d4 = j4pos - i2pos; loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum ); bornForce2.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce2.xyz += de.z*d3; bornForce2.xyz += de.w*d4; bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // i = 3 d1 = j1pos - i3pos; d2 = j2pos - i3pos; d3 = j3pos - i3pos; d4 = j4pos - i3pos; loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum ); bornForce3.xyz += de.x*d1; bornForce3.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce3.xyz += de.w*d4; bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // i = 4 d1 = j1pos - i4pos; d2 = j2pos - i4pos; d3 = j3pos - i4pos; d4 = j4pos - i4pos; loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum ); bornForce4.xyz += de.x*d1; bornForce4.xyz += de.y*d2; bornForce4.xyz += de.z*d3; bornForce4.xyz += de.w*d4; bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // j = 1 d1 = j1pos - i1pos; d2 = j1pos - i2pos; d3 = j1pos - i3pos; d4 = j1pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 2 d1 = j2pos - i1pos; d2 = j2pos - i2pos; d3 = j2pos - i3pos; d4 = j2pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 3 d1 = j3pos - i1pos; d2 = j3pos - i2pos; d3 = j3pos - i3pos; d4 = j3pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 4 d1 = j4pos - i1pos; d2 = j4pos - i2pos; d3 = j4pos - i3pos; d4 = j4pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } } /* --------------------------------------------------------------------------------------- Calculate Born radii and Obc chain derivative term numberOfAtoms: no. of atoms streamWidth: atom stream width forces input forces atomicRadii atomic radii bornRadii output Born radii obcChain output Obc chain derivative --------------------------------------------------------------------------------------- */ kernel void kObcBornRadii( float numberOfAtoms, float streamWidth, float4 forces<>, float atomicRadii<>, out float bornRadii<>, out float obcChain<> ){ // --------------------------------------------------------------------------------------- float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float2 iAtom; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum = forces.w; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; } kernel float4 scalar_force_CDLJMerge( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) { float4 invr, invrsig2, invrsig6; float4 f; invr = rsqrt( r2 ); invrsig2 = invr * sig; invrsig2 = invrsig2 * invrsig2; invrsig6 = invrsig2 * invrsig2 * invrsig2; f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; f += epsfac*qq*invr; f *= invr*invr*2.39f; return f; } /* --------------------------------------------------------------------------------------- Calculate Loop 2 OBC forces (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms based on unroll duplicationFactor: ? strheight: atom stream height streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) posq: atom positions and charge atomicRadii: atomic radii scaledAtomicRadii: scaled atomic radii bornForceFactor: (Born radii)**2*bornForce*Obcforce bornForce1: i-unroll first force component, including dBorn_r/dr in .w bornForce2: i-unroll second force component, including dBorn_r/dr in .w bornForce3: i-unroll third force component, including dBorn_r/dr in .w bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w #endif --------------------------------------------------------------------------------------- */ kernel void kObcLoop2Cdlj4( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float jstreamWidth, float epsfac, float4 posq[][], float atomicRadii[][], float scaledAtomicRadii[][], float bornForceFactor[][], float4 isigeps[][], float4 sigma[][], float4 epsilon[][], float excl[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii; float4 iAtomicRadii, jAtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii; float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 iScaledAtomicRadii, jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor; float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor; //float4 iBornForceFactor; // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; float i1q, i2q, i3q, i4q; float i1sig, i2sig, i3sig, i4sig; float i1eps, i2eps, i3eps, i4eps; float j1q, j2q, j3q, j4q; float2 exclind; float4 sig, eps; float exclusions; float4 r2, r2Masked; float3 pad; float4 exclmask; float2 iatom; float2 jstrindex; float linind; float4 jsig, jeps; float4 jQ,qq; //This is forceIndex mod numberOfAtoms, the true i index float4 de, fs, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; // --------------------------------------------------------------------------------------- // constants const float I_Unroll = 4.0f; const float4 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f ); // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // fetch i1 position, Born prefactor, atomic radius i1pos = posq[ iAtom ].xyz; i1q = posq[ iAtom ].w; i1BornForceFactor = bornForceFactor[ iAtom ]; i1AtomicRadii = atomicRadii[ iAtom ]; i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i1sig = jstrindex.x; i1eps = jstrindex.y; bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // fetch i2 position, Born prefactor, atomic radius iAtom.x += 1; i2pos = posq[ iAtom ].xyz; i2q = posq[ iAtom ].w; i2BornForceFactor = bornForceFactor[ iAtom ]; i2AtomicRadii = atomicRadii[ iAtom ]; i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i2sig = jstrindex.x; i2eps = jstrindex.y; bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // fetch i3 position, Born prefactor, atomic radius iAtom.x += 1; i3pos = posq[ iAtom ].xyz; i3q = posq[ iAtom ].w; i3BornForceFactor = bornForceFactor[ iAtom ]; i3AtomicRadii = atomicRadii[ iAtom ]; i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i3sig = jstrindex.x; i3eps = jstrindex.y; bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // fetch i4 position, Born prefactor, atomic radius iAtom.x += 1; i4pos = posq[ iAtom ].xyz; i4q = posq[ iAtom ].w; i4BornForceFactor = bornForceFactor[ iAtom ]; i4AtomicRadii = atomicRadii[ iAtom ]; i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i4sig = jstrindex.x; i4eps = jstrindex.y; bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // create float4 for main vars //iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor ); iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii ); iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii ); // inner loop indices setup whichRep = floor( forceIndex / roundedUpAtoms ); jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; exclind.x = iAtomLinearIndex; exclind.y = jStart*streamWidth/4.0f; exclind.y = floor( exclind.y + 0.25f ); // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); linind = (jAtom.x + jAtom.y*streamWidth)/4.0f; linind = floor( linind + 0.25f ); jstrindex.y = round( (linind - fmod(linind, jstreamWidth))/jstreamWidth); jstrindex.x = linind - jstrindex.y*jstreamWidth; jstrindex.x = floor( jstrindex.x + 0.25f ); jsig = sigma[ jstrindex ]; jeps = epsilon[ jstrindex ]; j1BornForceFactor = bornForceFactor[ jAtom ]; j1AtomicRadii = atomicRadii[ jAtom ]; j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j1pos = posq[ jAtom ].xyz; j1q = posq[ jAtom ].w; jAtom.x += 1.0f; j2BornForceFactor = bornForceFactor[ jAtom ]; j2AtomicRadii = atomicRadii[ jAtom ]; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j2pos = posq[ jAtom ].xyz; j2q = posq[ jAtom ].w; jAtom.x += 1.0f; j3BornForceFactor = bornForceFactor[ jAtom ]; j3AtomicRadii = atomicRadii[ jAtom ]; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j3pos = posq[ jAtom ].xyz; j3q = posq[ jAtom ].w; jAtom.x += 1.0f; j4BornForceFactor = bornForceFactor[ jAtom ]; j4AtomicRadii = atomicRadii[ jAtom ]; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j4pos = posq[ jAtom ].xyz; j4q = posq[ jAtom ].w; jAtom.x += 1.0f; //jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor ); jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii ); jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); jQ = float4( j1q, j2q, j3q, j4q ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 sig = i1sig + jsig; eps = i1eps * jeps; qq = i1q*jQ; d1 = j1pos - i1pos; d2 = j2pos - i1pos; d3 = j3pos - i1pos; d4 = j4pos - i1pos; // i = 1 r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum ); de += fs; //de = fs; bornForce1.xyz += de.x*d1; bornForce1.xyz += de.y*d2; bornForce1.xyz += de.z*d3; bornForce1.xyz += de.w*d4; bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // i = 2 exclind.x = floor( exclind.x + 1.2f ); exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); sig = i2sig + jsig; eps = i2eps * jeps; qq = i2q*jQ; d1 = j1pos - i2pos; d2 = j2pos - i2pos; d3 = j3pos - i2pos; d4 = j4pos - i2pos; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum ); de += fs; //de = fs; bornForce2.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce2.xyz += de.z*d3; bornForce2.xyz += de.w*d4; bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // i = 3 exclind.x = floor( exclind.x + 1.2f ); exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); sig = i3sig + jsig; eps = i3eps * jeps; qq = i3q*jQ; d1 = j1pos - i3pos; d2 = j2pos - i3pos; d3 = j3pos - i3pos; d4 = j4pos - i3pos; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum ); de += fs; //de = fs; bornForce3.xyz += de.x*d1; bornForce3.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce3.xyz += de.w*d4; bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // i = 4 exclind.x = floor( exclind.x + 1.2f ); exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); sig = i4sig + jsig; eps = i4eps * jeps; qq = i4q*jQ; d1 = j1pos - i4pos; d2 = j2pos - i4pos; d3 = j3pos - i4pos; d4 = j4pos - i4pos; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum ); de += fs; //de = fs; bornForce4.xyz += de.x*d1; bornForce4.xyz += de.y*d2; bornForce4.xyz += de.z*d3; bornForce4.xyz += de.w*d4; bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; // --------------------------------------------------------------------------------------- // j = 1 d1 = j1pos - i1pos; d2 = j1pos - i2pos; d3 = j1pos - i3pos; d4 = j1pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 2 d1 = j2pos - i1pos; d2 = j2pos - i2pos; d3 = j2pos - i3pos; d4 = j2pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 3 d1 = j3pos - i1pos; d2 = j3pos - i2pos; d3 = j3pos - i3pos; d4 = j3pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 4 d1 = j4pos - i1pos; d2 = j4pos - i2pos; d3 = j4pos - i3pos; d4 = j4pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- //exclind.x -= 3.0f; exclind.x = floor( exclind.x - 2.9f ); //exclind.y += 1.0f; exclind.y = floor( exclind.y + 1.25f ); // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } } /* --------------------------------------------------------------------------------------- Calculate Loop 2 OBC forces (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms based on unroll duplicationFactor: ? strheight: atom stream height streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) posq: atom positions and charge atomicRadii: atomic radii scaledAtomicRadii: scaled atomic radii bornForceFactor: (Born radii)**2*bornForce*Obcforce bornForce1: i-unroll first force component, including dBorn_r/dr in .w bornForce2: i-unroll second force component, including dBorn_r/dr in .w bornForce3: i-unroll third force component, including dBorn_r/dr in .w bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w #endif --------------------------------------------------------------------------------------- */ kernel void kObcLoop2Cdlj4X( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float jstreamWidth, float epsfac, float4 posq[][], float atomicRadii[][], float scaledAtomicRadii[][], float bornForceFactor[][], float4 isigeps[][], float4 sigma[][], float4 epsilon[][], float excl[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii; float4 iAtomicRadii, jAtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii; float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 iScaledAtomicRadii, jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor; float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor; //float4 iBornForceFactor; //float4 jBornForceFactor; // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; float i1q, i2q, i3q, i4q; float i1sig, i2sig, i3sig, i4sig; float i1eps, i2eps, i3eps, i4eps; float j1q, j2q, j3q, j4q; float2 exclind; float4 sig, eps; float exclusions; float4 r2, r2Masked; float3 pad; float4 exclmask; float2 iatom; float2 jstrindex; float linind; float4 jsig, jeps; float4 jQ,qq; //This is forceIndex mod numberOfAtoms, the true i index float4 de, fs, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float sum; // --------------------------------------------------------------------------------------- // constants const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 frac4 = float4( 0.2f, 0.2f, 0.2f, 0.2f ); const float I_Unroll = 4.0f; const float4 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f ); // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // fetch i1 position, Born prefactor, atomic radius i1pos = posq[ iAtom ].xyz; i1q = posq[ iAtom ].w; i1BornForceFactor = bornForceFactor[ iAtom ]; i1AtomicRadii = atomicRadii[ iAtom ]; i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i1sig = jstrindex.x; i1eps = jstrindex.y; bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // fetch i2 position, Born prefactor, atomic radius iAtom.x += 1; i2pos = posq[ iAtom ].xyz; i2q = posq[ iAtom ].w; i2BornForceFactor = bornForceFactor[ iAtom ]; i2AtomicRadii = atomicRadii[ iAtom ]; i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i2sig = jstrindex.x; i2eps = jstrindex.y; bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // fetch i3 position, Born prefactor, atomic radius iAtom.x += 1; i3pos = posq[ iAtom ].xyz; i3q = posq[ iAtom ].w; i3BornForceFactor = bornForceFactor[ iAtom ]; i3AtomicRadii = atomicRadii[ iAtom ]; i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i3sig = jstrindex.x; i3eps = jstrindex.y; bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // fetch i4 position, Born prefactor, atomic radius iAtom.x += 1; i4pos = posq[ iAtom ].xyz; i4q = posq[ iAtom ].w; i4BornForceFactor = bornForceFactor[ iAtom ]; i4AtomicRadii = atomicRadii[ iAtom ]; i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; jstrindex = isigeps[ iAtom ]; i4sig = jstrindex.x; i4eps = jstrindex.y; bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // create float4 for main vars //iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor ); iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii ); iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii ); // inner loop indices setup whichRep = floor( forceIndex / roundedUpAtoms ); jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; exclind.x = iAtomLinearIndex; exclind.y = jStart*streamWidth/4.0f; exclind.y = floor( exclind.y + 0.25f ); // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); linind = (jAtom.x + jAtom.y*streamWidth)/4.0f; linind = floor( linind + 0.25f ); jstrindex.y = round( (linind - fmod(linind, jstreamWidth))/jstreamWidth); jstrindex.x = linind - jstrindex.y*jstreamWidth; jstrindex.x = floor( jstrindex.x + 0.25f ); jsig = sigma[ jstrindex ]; jeps = epsilon[ jstrindex ]; j1BornForceFactor = bornForceFactor[ jAtom ]; j1AtomicRadii = atomicRadii[ jAtom ]; j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j1pos = posq[ jAtom ].xyz; j1q = posq[ jAtom ].w; jAtom.x += 1.0f; j2BornForceFactor = bornForceFactor[ jAtom ]; j2AtomicRadii = atomicRadii[ jAtom ]; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j2pos = posq[ jAtom ].xyz; j2q = posq[ jAtom ].w; jAtom.x += 1.0f; j3BornForceFactor = bornForceFactor[ jAtom ]; j3AtomicRadii = atomicRadii[ jAtom ]; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j3pos = posq[ jAtom ].xyz; j3q = posq[ jAtom ].w; jAtom.x += 1.0f; j4BornForceFactor = bornForceFactor[ jAtom ]; j4AtomicRadii = atomicRadii[ jAtom ]; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j4pos = posq[ jAtom ].xyz; j4q = posq[ jAtom ].w; jAtom.x += 1.0f; //jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor ); jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii ); jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); jQ = float4( j1q, j2q, j3q, j4q ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 /* sig = i1sig + jsig; eps = i1eps * jeps; qq = i1q*jQ; d1 = j1pos - i1pos; d2 = j2pos - i1pos; d3 = j3pos - i1pos; d4 = j4pos - i1pos; // i = 1 r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum ); //de += fs; de = fs; bornForce1.xyz += de.x*d1; bornForce1.xyz += de.y*d2; bornForce1.xyz += de.z*d3; bornForce1.xyz += de.w*d4; bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; */ exclmask = exclmask > frac4 ? one4 : zero4; sum = floor( exclmask.x ) + floor( exclmask.y ) + floor( exclmask.z ) + floor( exclmask.w ); if( sum > 0.25f ){ //if( exclind.y < 4.9f && exclind.y > 3.1f ){ //if( exclind.y < 0.9f ){ //if( exclind.y < 1.9f && exclind.y > 0.1f ){ //if( exclind.y < 2.9f && exclind.y > 1.1f ){ //if( exclind.y < 5.9f && exclind.y > 4.1f ){ bornForce1.x += 1.0f; bornForce1.y += exclusions; bornForce1.z += sum; bornForce1.w += exclmask.w; //bornForce1 = exclmask; } // --------------------------------------------------------------------------------------- // i = 2 exclind.x = floor( exclind.x + 1.2f ); exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); exclmask = exclmask > frac4 ? one4 : zero4; sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f ); if( sum > 0.25f ){ //if( exclind.y < 5.9f && exclind.y > 4.1f ){ bornForce2.x += 1.0f; bornForce2.y += exclusions; bornForce2.z += sum; bornForce2.w += exclind.y; } /* sig = i2sig + jsig; eps = i2eps * jeps; qq = i2q*jQ; d1 = j1pos - i2pos; d2 = j2pos - i2pos; d3 = j3pos - i2pos; d4 = j4pos - i2pos; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum ); //de += fs; de = fs; bornForce2.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce2.xyz += de.z*d3; bornForce2.xyz += de.w*d4; bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; */ // --------------------------------------------------------------------------------------- // i = 3 exclind.x = floor( exclind.x + 1.2f ); exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); exclmask = exclmask > frac4 ? one4 : zero4; sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f ); if( sum > 0.25f ){ //if( exclind.y < 5.9f && exclind.y > 4.1f ){ bornForce3.x += 1.0f; bornForce3.y += exclusions; bornForce3.z += sum; bornForce3.w += exclind.y; } /* sig = i3sig + jsig; eps = i3eps * jeps; qq = i3q*jQ; d1 = j1pos - i3pos; d2 = j2pos - i3pos; d3 = j3pos - i3pos; d4 = j4pos - i3pos; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum ); //de += fs; de = fs; bornForce3.xyz += de.x*d1; bornForce3.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce3.xyz += de.w*d4; bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; */ // --------------------------------------------------------------------------------------- // i = 4 exclind.x = floor( exclind.x + 1.2f ); exclusions = excl[ exclind ]; exclmask = fmod( exclusions, exclconst ); exclmask = exclmask > frac4 ? one4 : zero4; sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f ); if( sum > 0.25f ){ //if( exclind.y < 5.9f && exclind.y > 4.1f ){ bornForce4.x += 1.0f; bornForce4.y += exclusions; bornForce4.z += sum; bornForce4.w += exclind.y; } /* sig = i4sig + jsig; eps = i4eps * jeps; qq = i4q*jQ; d1 = j1pos - i4pos; d2 = j2pos - i4pos; d3 = j3pos - i4pos; d4 = j4pos - i4pos; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); r2Masked = r2 + exclmask*10000.0f; fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked ); loop2InternalR2( r2, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum ); //de += fs; de = fs; bornForce4.xyz += de.x*d1; bornForce4.xyz += de.y*d2; bornForce4.xyz += de.z*d3; bornForce4.xyz += de.w*d4; bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; */ // --------------------------------------------------------------------------------------- /* // j = 1 d1 = j1pos - i1pos; d2 = j1pos - i2pos; d3 = j1pos - i3pos; d4 = j1pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 2 d1 = j2pos - i1pos; d2 = j2pos - i2pos; d3 = j2pos - i3pos; d4 = j2pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 3 d1 = j3pos - i1pos; d2 = j3pos - i2pos; d3 = j3pos - i3pos; d4 = j3pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; // --------------------------------------------------------------------------------------- // j = 4 d1 = j4pos - i1pos; d2 = j4pos - i2pos; d3 = j4pos - i3pos; d4 = j4pos - i4pos; // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de ); bornForce1.xyz += de.x*d1; bornForce2.xyz += de.y*d2; bornForce3.xyz += de.z*d3; bornForce4.xyz += de.w*d4; */ // --------------------------------------------------------------------------------------- //exclind.x -= 3.0f; exclind.x = floor( exclind.x - 2.9f ); //exclind.y += 1.0f; exclind.y = floor( exclind.y + 1.25f ); // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } }