/**************************************************************** * This file is part of the gpu acceleration library for gromacs. * Author: Mark Friedrichs * * This kernel was developed in collaboration with * * Copyright (C) Pande Group, Stanford, 2006 *****************************************************************/ //96 : 228 Flops kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR, float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>, out float4 dGpol_dalpha2_ij<> ){ // --------------------------------------------------------------------------------------- float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol; // --------------------------------------------------------------------------------------- r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20 alpha2_ij = jBornR*iBornR; //4 D_ij = r2/(4.0f*alpha2_ij); //8 expTerm = exp( -D_ij ); //4 : 80 denominator2 = r2 + alpha2_ij*expTerm; //8 denominator = sqrt( denominator2 ); //4 : 60 Gpol = jQ/denominator; //4 Gpol *= iQ;//4 dGpol_dr = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2; //16 dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; //24 } /* --------------------------------------------------------------------------------------- Calculate nonpolar ACE term (Simbios) bornRadius: Born radius vdwRadius: Vdw radius duplicationFactor: duplication factor aceForce: ACE term --------------------------------------------------------------------------------------- */ kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor, out float aceForce<> ){ // --------------------------------------------------------------------------------------- // nonpolar term float iSurface; float iAceTerm; // --------------------------------------------------------------------------------------- // constants // solvent radius const float probeRadius = 0.14f; // PI*24*0.0f049 (0.0f054=asolv from Tinker) //const float PI_24_aI = -0.3694512961; const float PI_24_aI = -400.71504079f; // --------------------------------------------------------------------------------------- // etch i position and partial charge // e = ai * term * (ri+probe)**2 * (ri/rb)**6 // (drbi) = drb(i) - 6.0fd0*e/rb // (rI+probe)**2 iSurface = (iVdwRadius+probeRadius); iSurface = iSurface*iSurface; // (rI/rB)**6 iAceTerm = iVdwRadius/iBornRadius; iAceTerm = iAceTerm*iAceTerm*iAceTerm; iAceTerm = iAceTerm*iAceTerm; aceForce = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius); } /* --------------------------------------------------------------------------------------- Calculate first loop force terms (Simbios) numberOfAtoms: no. of atoms roundedUpAtoms: rounded up number of atoms -- accounts for unrolling duplicationFactor: number of threads for inner loop streamWidth: atom stream width fstreamWidth: force stream width (output -- i-unroll) soluteDielectric: solute dielectric solventDielectric: solvent dielectric includeAce: include ACE term posq: atom positions and charge bornRadii: Born radii nonpolarForce: nonpolar force (0 if nonpolar not included, else ACE value) bornForce1: i-unroll first force component, including dBornR/dr in .w bornForce2: i-unroll second force component, including dBornR/dr in .w bornForce3: i-unroll first force component, including dBornR/dr in .w bornForce4: i-unroll second force component, including dBornR/dr in .w --------------------------------------------------------------------------------------- */ //FLOPS = 544 kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, float streamWidth, float fstreamWidth, float soluteDielectric, float solventDielectric, float includeAce, float4 posq[][], float bornRadii[][], float atomicRadii[][], out float4 bornForce1<>, out float4 bornForce2<>, out float4 bornForce3<>, out float4 bornForce4<> ){ // --------------------------------------------------------------------------------------- // Born radii float i1BornR, i2BornR, i3BornR, i4BornR; float j1BornR, j2BornR, j3BornR, j4BornR; float4 jBornR; // atomic radii float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR; // i,j coordinates float3 i1Pos, i2Pos, i3Pos, i4Pos; float3 j1Pos, j2Pos, j3Pos, j4Pos; float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ; // i, j partial charges float i1Q, i2Q, i3Q, i4Q; float j1Q, j2Q, j3Q, j4Q; float4 jQ; float aceForce; // delta coordinates float3 d1, d2, d3, d4; // intermediate terms float4 dGpol_dr, dGpol_dalpha2_ij; // indices float2 iAtom; float forceIndex; // This is forceIndex mod numberOfAtoms, the true i index float iAtomLinearIndex, jLinind; float2 jAtom; float jEnd, jStart, jBlock; float whichRep; float tmp; // --------------------------------------------------------------------------------------- // electricConstant = -166.0f2691; // preFactor = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric)) float preFactor = -332.05382f; const float I_Unroll = 4.0f; const float3 zero3 = float3( 0.0f, 0.0f, 0.0f ); // --------------------------------------------------------------------------------------- preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) ); iAtom = indexof( bornForce1 ); forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth ); iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms ); // --------------------------------------------------------------------------------------- // set gather index iAtom.x = fmod( iAtomLinearIndex, streamWidth ); iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth ); // --------------------------------------------------------------------------------------- // etch i1 position and partial charge jQ = posq[ iAtom ]; i1BornR = bornRadii[ iAtom ]; i1AtomicR = atomicRadii[ iAtom ]; i1Pos = jQ.xyz; i1Q = jQ.w; i1Q *= preFactor; kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce ); bornForce1.xyz = zero3; bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i2 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i2BornR = bornRadii[ iAtom ]; i2AtomicR = atomicRadii[ iAtom ]; i2Pos = jQ.xyz; i2Q = jQ.w; i2Q *= preFactor; kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce ); bornForce2.xyz = zero3; bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i3 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i3BornR = bornRadii[ iAtom ]; i3AtomicR = atomicRadii[ iAtom ]; i3Pos = jQ.xyz; i3Q = jQ.w; i3Q *= preFactor; kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce ); bornForce3.xyz = zero3; bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // etch i4 position and partial charge iAtom.x += 1; jQ = posq[ iAtom ]; i4BornR = bornRadii[ iAtom ]; i4AtomicR = atomicRadii[ iAtom ]; i4Pos = jQ.xyz; i4Q = jQ.w; i4Q *= preFactor; kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce ); bornForce4.xyz = zero3; bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f; // --------------------------------------------------------------------------------------- // inner loop setup // if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments // to increase number of threads in flight // forceStreamSz = N*RepFac/I_UnRoll // forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz ) // whichRep = 0 or 1 // jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ] //changed the following instruction for rounding issues on some ASICs //whichRep = floor( forceIndex / roundedUpAtoms ); tmp = fmod(forceIndex, roundedUpAtoms); whichRep = round((forceIndex - tmp)/roundedUpAtoms); //jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) ); tmp = fmod( numberOfAtoms, (duplicationFactor*streamWidth ) ); jBlock = 1 + round( (numberOfAtoms-tmp)/(duplicationFactor*streamWidth )); jStart = whichRep*jBlock; jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock); jAtom.y = jStart; jLinind = jAtom.y*streamWidth; // --------------------------------------------------------------------------------------- while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){ jAtom.x = 0.0f; while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) { // --------------------------------------------------------------------------------------- // gather required values j1PosQ = posq[ jAtom ]; j1Pos = j1PosQ.xyz; j1Q = j1PosQ.w; j1BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j2PosQ = posq[ jAtom ]; j2Pos = j2PosQ.xyz; j2Q = j2PosQ.w; j2BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j3PosQ = posq[ jAtom ]; j3Pos = j3PosQ.xyz; j3Q = j3PosQ.w; j3BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; j4PosQ = posq[ jAtom ]; j4Pos = j4PosQ.xyz; j4Q = j4PosQ.w; j4BornR = bornRadii[ jAtom ]; jAtom.x += 1.0f; jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR ); jQ = float4( j1Q, j2Q, j3Q, j4Q ); // --------------------------------------------------------------------------------------- // i == 1 d1 = i1Pos - j1Pos; //3 d2 = i1Pos - j2Pos; //3 d3 = i1Pos - j3Pos; //3 d4 = i1Pos - j4Pos; //3 loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228 bornForce1.xyz += dGpol_dr.x*d1; //6 bornForce1.xyz += dGpol_dr.y*d2; //6 bornForce1.xyz += dGpol_dr.z*d3; //6 bornForce1.xyz += dGpol_dr.w*d4; //6 bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4 // --------------------------------------------------------------------------------------- // i == 2 d1 = i2Pos - j1Pos; //3 d2 = i2Pos - j2Pos; //3 d3 = i2Pos - j3Pos; //3 d4 = i2Pos - j4Pos; //3 loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228 bornForce2.xyz += dGpol_dr.x*d1; //6 bornForce2.xyz += dGpol_dr.y*d2; //6 bornForce2.xyz += dGpol_dr.z*d3; //6 bornForce2.xyz += dGpol_dr.w*d4; //6 bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4 // --------------------------------------------------------------------------------------- // i == 3 d1 = i3Pos - j1Pos; //3 d2 = i3Pos - j2Pos; //3 d3 = i3Pos - j3Pos; //3 d4 = i3Pos - j4Pos; //3 loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228 bornForce3.xyz += dGpol_dr.x*d1; //6 bornForce3.xyz += dGpol_dr.y*d2; //6 bornForce3.xyz += dGpol_dr.z*d3; //6 bornForce3.xyz += dGpol_dr.w*d4; //6 bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4 // --------------------------------------------------------------------------------------- // i == 4 d1 = i4Pos - j1Pos; //3 d2 = i4Pos - j2Pos; //3 d3 = i4Pos - j3Pos; //3 d4 = i4Pos - j4Pos; //3 loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228 bornForce4.xyz += dGpol_dr.x*d1; //6 bornForce4.xyz += dGpol_dr.y*d2; //6 bornForce4.xyz += dGpol_dr.z*d3; //6 bornForce4.xyz += dGpol_dr.w*d4; //6 bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4 // --------------------------------------------------------------------------------------- jLinind += 4.0f; } jAtom.y += 1.0f; } }