/**************************************************************** * This file is part of the gpu acceleration library for gromacs. * Author: Mark Friedrichs * * This kernel was developed in collaboration with * * Copyright (C) Pande Group, Stanford, 2006 *****************************************************************/ //156 FLOPS [172 total] kernel void loop2Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); //4 rInverse = rsqrt( r2 ); //8 r2Inverse = rInverse*rInverse; //4 rPlusScaledJ = r + jAtomicRadiiScaled; //4 rMinusScaledJ = r - jAtomicRadiiScaled; //4 rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; //4 l_ij2 = l_ij*l_ij; //4 u_ij = 1.0f/rPlusScaledJ; //4 u_ij2 = u_ij*u_ij; //4 diff_u2_l2 = u_ij2 - l_ij2; //4 scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; //4 logRatio = log(u_ij/l_ij); //4 // terms associated w/ dL/dr & dU/dr are zero and not included t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; //7*4 de = mask*bornForce*t3*rInverse; //12 // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = scaledRadiusJ2; // de = mask*t3; // de = mask*rPlusScaledJ; // de = mask*r2; // de = mask; // de = mask*rInverse; // de = mask*bornForce; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; /* bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; //10*4 t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; //12 for true, but don't know how to average so saying 0 bornSum += t3; //4 bornSum *= mask; //4 */ } kernel void loop2InternalR2( float4 r2, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); rInverse = rsqrt( r2 ); r2Inverse = rInverse*rInverse; rPlusScaledJ = r + jAtomicRadiiScaled; rMinusScaledJ = r - jAtomicRadiiScaled; rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; l_ij2 = l_ij*l_ij; u_ij = 1.0f/rPlusScaledJ; u_ij2 = u_ij*u_ij; diff_u2_l2 = u_ij2 - l_ij2; scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; logRatio = log(u_ij/l_ij); // terms associated w/ dL/dr & dU/dr are zero and not included t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse; de = mask*bornForce*t3*rInverse; // de = float4( 1.0f, 1.0f, 1.0f, 1.0f ); // de = scaledRadiusJ2; // de = mask*t3; // de = mask*rPlusScaledJ; // de = mask*r2; // de = mask; // de = mask*rInverse; // de = mask*bornForce; // Born radius term // bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; bornSum += t3; bornSum *= mask; } /* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */ //112 FLOPS kernel void loop2InternalNoSum( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float bornForce, float iAtomicRadii, out float4 de<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // mask = r2 < smallValue4 ? zero4 : one4; // r2 = r2 < smallValue4 ? bigValue4 : r2; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); //4 rInverse = rsqrt( r2 ); //8 r2Inverse = rInverse*rInverse; //4 rPlusScaledJ = r + jAtomicRadiiScaled; //4 rMinusScaledJ = r - jAtomicRadiiScaled; //4 rMinusScaledJAbs = abs( rMinusScaledJ ); // --------------------------------------------------------------------------------------- // mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; //4 l_ij2 = l_ij*l_ij; //4 u_ij = 1.0f/rPlusScaledJ; //4 u_ij2 = u_ij*u_ij; //4 scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled; //4 logRatio = log(u_ij/l_ij); //4 // terms associated w/ dL/dr & dU/dr are zero and not included t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse; //8*4 de = mask*bornForce*t3*rInverse; //12 } //124 FLOPS [??? total] kernel void bornSumInternal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled, float iAtomicRadii, out float4 bornSum<> ){ // --------------------------------------------------------------------------------------- float4 r, rInverse, r2, r2Inverse; float4 l_ij, l_ij2; float4 u_ij, u_ij2; float4 t3; float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs; float4 iAtomicRadii4; float4 diff_u2_l2; const float bigValue = 1000000.0f; const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue ); const float smallValue = 0.000001f; const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue ); const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f ); float4 mask; // --------------------------------------------------------------------------------------- iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii ); r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 // --------------------------------------------------------------------------------------- // not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0 // r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work mask = r2 < smallValue4 ? zero4 : one4; // --------------------------------------------------------------------------------------- r = sqrt( r2 ); //4 rInverse = rsqrt( r2 ); //8 rPlusScaledJ = r + jAtomicRadiiScaled; //4 rMinusScaledJ = r - jAtomicRadiiScaled; //4 rMinusScaledJAbs = abs( rMinusScaledJ ); //40 // --------------------------------------------------------------------------------------- mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask; l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs; // --------------------------------------------------------------------------------------- l_ij = 1.0f/l_ij; //4 l_ij2 = l_ij*l_ij; //4 u_ij = 1.0f/rPlusScaledJ; //4 u_ij2 = u_ij*u_ij; //4 diff_u2_l2 = u_ij2 - l_ij2; //4 logRatio = log(u_ij/l_ij); //4 //24 // --------------------------------------------------------------------------------------- // terms associated w/ dL/dr & dU/dr are zero and not included // Born radius term bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2; //10*4 t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4; //12 for true, but don't know how to average so saying 0 bornSum += t3; //4 bornSum *= mask; //4 //60 } /* --------------------------------------------------------------------------------------- Calculate Loop 2 OBC forces (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms based on unroll duplicationFactor: ? strheight: atom stream height streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) posq: atom positions and charge atomicRadii: atomic radii scaledAtomicRadii: scaled atomic radii bornForceFactor: (Born radii)**2*bornForce*Obcforce bornForce1: i-unroll first force component, including dBorn_r/dr in .w bornForce2: i-unroll second force component, including dBorn_r/dr in .w bornForce3: i-unroll third force component, including dBorn_r/dr in .w bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w #endif --------------------------------------------------------------------------------------- */ //1376 FLOPS kernel void kObcLoop2( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float4 posq[][], float atomicRadii[][], float scaledAtomicRadii[][], float bornForceFactor[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii; float4 iAtomicRadii, jAtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii; float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 iScaledAtomicRadii, jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor; float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor; //float4 iBornForceFactor; //float4 jBornForceFactor; // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; //This is forceIndex mod numberOfAtoms, the true i index float4 de, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; float tmp2; // --------------------------------------------------------------------------------------- // constants const float I_Unroll = 4.0f; // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // etch i1 position, Born prefactor, atomic radius i1pos = posq[ iAtom ].xyz; i1BornForceFactor = bornForceFactor[ iAtom ]; i1AtomicRadii = atomicRadii[ iAtom ]; i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // etch i2 position, Born prefactor, atomic radius iAtom.x += 1; i2pos = posq[ iAtom ].xyz; i2BornForceFactor = bornForceFactor[ iAtom ]; i2AtomicRadii = atomicRadii[ iAtom ]; i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // etch i3 position, Born prefactor, atomic radius iAtom.x += 1; i3pos = posq[ iAtom ].xyz; i3BornForceFactor = bornForceFactor[ iAtom ]; i3AtomicRadii = atomicRadii[ iAtom ]; i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // etch i4 position, Born prefactor, atomic radius iAtom.x += 1; i4pos = posq[ iAtom ].xyz; i4BornForceFactor = bornForceFactor[ iAtom ]; i4AtomicRadii = atomicRadii[ iAtom ]; i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ]; bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // create float4 for main vars //iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor ); iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii ); iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii ); // inner loop indices setup //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); //jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); jBlock = 1 + round( (numberOfAtoms-fmod( numberOfAtoms, (duplicationFactor*streamWidth) )) / (duplicationFactor*streamWidth) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values j1BornForceFactor = bornForceFactor[ jAtom ]; j1AtomicRadii = atomicRadii[ jAtom ]; j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j1pos = posq[ jAtom ].xyz; jAtom.x += 1.0f; j2BornForceFactor = bornForceFactor[ jAtom ]; j2AtomicRadii = atomicRadii[ jAtom ]; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j2pos = posq[ jAtom ].xyz; jAtom.x += 1.0f; j3BornForceFactor = bornForceFactor[ jAtom ]; j3AtomicRadii = atomicRadii[ jAtom ]; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j3pos = posq[ jAtom ].xyz; jAtom.x += 1.0f; j4BornForceFactor = bornForceFactor[ jAtom ]; j4AtomicRadii = atomicRadii[ jAtom ]; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ]; j4pos = posq[ jAtom ].xyz; jAtom.x += 1.0f; //jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor ); jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii ); jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 d1 = j1pos - i1pos; //3 d2 = j2pos - i1pos; //3 d3 = j3pos - i1pos; //3 d4 = j4pos - i1pos; //3 // i = 1 //loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum ); //156 : 368 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de ); //156 : 368 bornForce1.xyz += de.x*d1; //3*2=6 bornForce1.xyz += de.y*d2; //3*2=6 bornForce1.xyz += de.z*d3; //3*2=6 bornForce1.xyz += de.w*d4; //3*2=6 //bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 2 d1 = j1pos - i2pos; //3 d2 = j2pos - i2pos; //3 d3 = j3pos - i2pos; //3 d4 = j4pos - i2pos; //3 //loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );//156 : 368 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de );//156 : 368 bornForce2.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce2.xyz += de.z*d3; //3*2=6 bornForce2.xyz += de.w*d4; //3*2=6 //bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 3 d1 = j1pos - i3pos; //3 d2 = j2pos - i3pos; //3 d3 = j3pos - i3pos; //3 d4 = j4pos - i3pos; //3 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de );//156 : 368 bornForce3.xyz += de.x*d1; //3*2=6 bornForce3.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce3.xyz += de.w*d4; //3*2=6 //bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 4 d1 = j1pos - i4pos; //3 d2 = j2pos - i4pos; //3 d3 = j3pos - i4pos; //3 d4 = j4pos - i4pos; //3 loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de );//156 : 366 bornForce4.xyz += de.x*d1; //3*2=6 bornForce4.xyz += de.y*d2; //3*2=6 bornForce4.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 //bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // j = 1 d1 = j1pos - i1pos; //3 d2 = j1pos - i2pos; //3 d3 = j1pos - i3pos; //3 d4 = j1pos - i4pos; //3 // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 2 d1 = j2pos - i1pos; //3 d2 = j2pos - i2pos; //3 d3 = j2pos - i3pos; //3 d4 = j2pos - i4pos; //3 // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 3 d1 = j3pos - i1pos; //3 d2 = j3pos - i2pos; //3 d3 = j3pos - i3pos; //3 d4 = j3pos - i4pos; //3 // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // j = 4 d1 = j4pos - i1pos; //3 d2 = j4pos - i2pos; //3 d3 = j4pos - i3pos; //3 d4 = j4pos - i4pos; //3 // loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum ); loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de ); //112 : 304 bornForce1.xyz += de.x*d1; //3*2=6 bornForce2.xyz += de.y*d2; //3*2=6 bornForce3.xyz += de.z*d3; //3*2=6 bornForce4.xyz += de.w*d4; //3*2=6 // --------------------------------------------------------------------------------------- // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } } /* --------------------------------------------------------------------------------------- Calculate Born radii and Obc chain derivative term numberOfAtoms: no. of atoms streamWidth: atom stream width forces input forces atomicRadii atomic radii bornRadii output Born radii obcChain output Obc chain derivative --------------------------------------------------------------------------------------- */ kernel void kObcBornRadii( float numberOfAtoms, float streamWidth, float4 forces<>, float atomicRadii<>, out float bornRadii<>, out float obcChain<> ){ // --------------------------------------------------------------------------------------- float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float2 iAtom; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum = forces.w; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; } kernel float4 scalar_force_CDLJMerge( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) { float4 invr, invrsig2, invrsig6; float4 f; invr = rsqrt( r2 ); invrsig2 = invr * sig; invrsig2 = invrsig2 * invrsig2; invrsig6 = invrsig2 * invrsig2 * invrsig2; f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; f += epsfac*qq*invr; f *= invr*invr*2.39f; return f; } //???? FLOPS kernel void kCalculateBornRadii( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float3 posq[][], float2 scaledAtomicRadii[][], out float4 bornForce<> ){ // --------------------------------------------------------------------------------------- // atomic radii: radius - dielectric offset float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii; // scaled atomic radii: scaleFactor*( radius - dielectric offset ) float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii; float4 jScaledAtomicRadii; // (Born radii**2)*BornForce*ObcChainForce // i,j coordinates float3 i1pos, i2pos, i3pos, i4pos; float3 j1pos, j2pos, j3pos, j4pos; // delta coordinates float3 d1, d2, d3, d4; // indices float2 iAtom; float forceIndex; //This is forceIndex mod numberOfAtoms, the true i index float4 de, bornSum; float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; float tmp2; // --------------------------------------------------------------------------------------- // constants const float I_Unroll = 4.0f; // --------------------------------------------------------------------------------------- // set linear index iAtom = indexof( bornForce ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather coordinates iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth ); // --------------------------------------------------------------------------------------- // fetch position, (atomic radius - dielectric offset ) i1pos = posq[ iAtom ]; i1AtomicRadii = scaledAtomicRadii[ iAtom ].y; iAtom.x += 1; i2pos = posq[ iAtom ]; i2AtomicRadii = scaledAtomicRadii[ iAtom ].y; iAtom.x += 1; i3pos = posq[ iAtom ]; i3AtomicRadii = scaledAtomicRadii[ iAtom ].y; iAtom.x += 1; i4pos = posq[ iAtom ]; i4AtomicRadii = scaledAtomicRadii[ iAtom ].y; bornForce = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // inner loop indices setup //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); jBlock = 1 + round( (numberOfAtoms-fmod( numberOfAtoms, (duplicationFactor*streamWidth) )) / (duplicationFactor*streamWidth) ); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){ // --------------------------------------------------------------------------------------- // gather required values j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j1pos = posq[ jAtom ]; jAtom.x += 1.0f; j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j2pos = posq[ jAtom ]; jAtom.x += 1.0f; j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j3pos = posq[ jAtom ]; jAtom.x += 1.0f; j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x; j4pos = posq[ jAtom ]; jAtom.x += 1.0f; jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii ); // --------------------------------------------------------------------------------------- // First set of 4 -- i == 1 d1 = j1pos - i1pos; //3 d2 = j2pos - i1pos; //3 d3 = j3pos - i1pos; //3 d4 = j4pos - i1pos; //3 // i = 1 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i1AtomicRadii, bornSum ); //112 : 368(?) bornForce.x += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 2 d1 = j1pos - i2pos; //3 d2 = j2pos - i2pos; //3 d3 = j3pos - i2pos; //3 d4 = j4pos - i2pos; //3 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i2AtomicRadii, bornSum );//112 : 368(?) bornForce.y += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 3 d1 = j1pos - i3pos; //3 d2 = j2pos - i3pos; //3 d3 = j3pos - i3pos; //3 d4 = j4pos - i3pos; //3 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i3AtomicRadii, bornSum );//112 : 368(?) bornForce.z += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // i = 4 d1 = j1pos - i4pos; //3 d2 = j2pos - i4pos; //3 d3 = j3pos - i4pos; //3 d4 = j4pos - i4pos; //3 bornSumInternal( d1, d2, d3, d4, jScaledAtomicRadii, i4AtomicRadii, bornSum );//112 : 368(?) bornForce.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w; //4 // --------------------------------------------------------------------------------------- // increment linear index by 4 jLinind += 4.0f; } jAtom.y += 1.0f; } }