Commit 408469c3 authored by Peter Eastman
Browse files

Optimizations to PME

parent 45b0302d
...@@ -152,15 +152,26 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir ...@@ -152,15 +152,26 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
} }
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ, __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float* scalingFactors, float* outputForce, float outputTorque[2][3], float* energy float* scalingFactors, float* outputForce, float3 outputTorque[3], float* energy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
,float4* debugArray ,float4* debugArray
#endif #endif
){ ){
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic box
float e,ei; xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
float erl,erli; yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2);
float ck = atomJ.q;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec); float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
...@@ -184,22 +195,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -184,22 +195,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qi8 = atomI.labFrameQuadrupole[7]; float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8]; float qi9 = atomI.labFrameQuadrupole[8];
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic box
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2);
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0]; float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1]; float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2]; float dk3 = atomJ.labFrameDipole[2];
...@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// compute the energy contributions for this interaction // compute the energy contributions for this interaction
e = bn0*gl0 + bn1*(gl1+gl6) float e = bn0*gl0 + bn1*(gl1+gl6)
+ bn2*(gl2+gl7+gl8) + bn2*(gl2+gl7+gl8)
+ bn3*(gl3+gl5) + bn4*gl4; + bn3*(gl3+gl5) + bn4*gl4;
ei = 0.5f * (bn1*(gli1+gli6) float ei = 0.5f * (bn1*(gli1+gli6)
+ bn2*(gli2+gli7) + bn3*gli3); + bn2*(gli2+gli7) + bn3*gli3);
// get the real energy without any screening function // get the real energy without any screening function
erl = rr1*gl0 + rr3*(gl1+gl6) float erl = rr1*gl0 + rr3*(gl1+gl6)
+ rr5*(gl2+gl7+gl8) + rr5*(gl2+gl7+gl8)
+ rr7*(gl3+gl5) + rr9*gl4; + rr7*(gl3+gl5) + rr9*gl4;
erli = 0.5f*(rr3*(gli1+gli6)*psc3 float erli = 0.5f*(rr3*(gli1+gli6)*psc3
+ rr5*(gli2+gli7)*psc5 + rr5*(gli2+gli7)*psc5
+ rr7*gli3*psc7); + rr7*gli3*psc7);
e = e - (1.0f-scalingFactors[MScaleIndex])*erl; e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
...@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// intermediate variables for induced force terms // intermediate variables for induced force terms
float gfi1 = 0.5f*bn2*(gli1+glip1+gli6+glip6) float gfi1 = 0.5f*(bn2*(gli1+glip1+gli6+glip6)
+ 0.5f*bn2*scip2 + bn2*scip2
+ 0.5f*bn3*(gli2+glip2+gli7+glip7) + bn3*(gli2+glip2+gli7+glip7)
- 0.5f*bn3*(sci3*scip4+scip3*sci4) - bn3*(sci3*scip4+scip3*sci4)
+ 0.5f*bn4*(gli3+glip3); + bn4*(gli3+glip3));
float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3; float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3; float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float gfi4 = 2.0f * bn2; float gfi4 = 2.0f * bn2;
float gfi5 = bn3 * (sci4+scip4); float gfi5 = bn3 * (sci4+scip4);
float gfi6 = -bn3 * (sci3+scip3); float gfi6 = -bn3 * (sci3+scip3);
float gfri1 = 0.5f*rr5*((gli1+gli6)*psc3 float gfri1 = 0.5f*(rr5*((gli1+gli6)*psc3
+ (glip1+glip6)*dsc3 + (glip1+glip6)*dsc3
+ scip2*usc3) + scip2*usc3)
+ 0.5f*rr7*((gli7+gli2)*psc5 + rr7*((gli7+gli2)*psc5
+ (glip7+glip2)*dsc5 + (glip7+glip2)*dsc5
- (sci3*scip4+scip3*sci4)*usc5) - (sci3*scip4+scip3*sci4)*usc5)
+ 0.5f*rr9*(gli3*psc7+glip3*dsc7); + rr9*(gli3*psc7+glip3*dsc7));
float gfri4 = 2.0f * rr5; float gfri4 = 2.0f * rr5;
float gfri5 = rr7 * (sci4*psc7+scip4*dsc7); float gfri5 = rr7 * (sci4*psc7+scip4*dsc7);
float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7); float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7);
...@@ -858,13 +853,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -858,13 +853,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[2] = conversionFactor*(ftm23 + ftm2i3); outputForce[2] = conversionFactor*(ftm23 + ftm2i3);
conversionFactor *= -1.0; conversionFactor *= -1.0;
outputTorque[0][0] = conversionFactor*(ttm21 + ttm2i1); outputTorque[0].x = conversionFactor*(ttm21 + ttm2i1);
outputTorque[0][1] = conversionFactor*(ttm22 + ttm2i2); outputTorque[0].y = conversionFactor*(ttm22 + ttm2i2);
outputTorque[0][2] = conversionFactor*(ttm23 + ttm2i3); outputTorque[0].z = conversionFactor*(ttm23 + ttm2i3);
outputTorque[1][0] = conversionFactor*(ttm31 + ttm3i1); outputTorque[1].x = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1][1] = conversionFactor*(ttm32 + ttm3i2); outputTorque[1].y = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1][2] = conversionFactor*(ttm33 + ttm3i3); outputTorque[1].z = conversionFactor*(ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
int debugIndex = 0; int debugIndex = 0;
...@@ -959,13 +954,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -959,13 +954,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[1] = 0.0f; outputForce[1] = 0.0f;
outputForce[2] = 0.0f; outputForce[2] = 0.0f;
outputTorque[0][0] = 0.0f; outputTorque[0].x = 0.0f;
outputTorque[0][1] = 0.0f; outputTorque[0].y = 0.0f;
outputTorque[0][2] = 0.0f; outputTorque[0].z = 0.0f;
outputTorque[1][0] = 0.0f; outputTorque[1].x = 0.0f;
outputTorque[1][1] = 0.0f; outputTorque[1].y = 0.0f;
outputTorque[1][2] = 0.0f; outputTorque[1].z = 0.0f;
*energy = 0.0f; *energy = 0.0f;
......
...@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
{ {
float force[3]; float force[3];
float torque[2][3]; float3 torque[2];
unsigned int atomJ = y + j; unsigned int atomJ = y + j;
...@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f; localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f; localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f; localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f; totalEnergy += mask ? 0.5*energy : 0.0f;
...@@ -181,15 +181,15 @@ if( atomI == targetAtom ){ ...@@ -181,15 +181,15 @@ if( atomI == targetAtom ){
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f; debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f; debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f; debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = mask ? energy : 0.0f; debugArray[index].w = mask ? energy : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f; debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f; debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f; debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x); debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){ for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
...@@ -304,7 +304,7 @@ if( atomI == targetAtom ){ ...@@ -304,7 +304,7 @@ if( atomI == targetAtom ){
unsigned int atomJ = y + jIdx; unsigned int atomJ = y + jIdx;
float force[3]; float force[3];
float torque[2][3]; float3 torque[2];
// set scale factors // set scale factors
...@@ -335,9 +335,9 @@ if( atomI == targetAtom ){ ...@@ -335,9 +335,9 @@ if( atomI == targetAtom ){
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f; localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f; localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f; localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? energy : 0.0f; totalEnergy += mask ? energy : 0.0f;
...@@ -349,9 +349,9 @@ if( atomI == targetAtom ){ ...@@ -349,9 +349,9 @@ if( atomI == targetAtom ){
psA[jIdx].force[1] -= mask ? force[1] : 0.0f; psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f; psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f; psA[jIdx].torque[0] += mask ? torque[1].x : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f; psA[jIdx].torque[1] += mask ? torque[1].y : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f; psA[jIdx].torque[2] += mask ? torque[1].z : 0.0f;
} else { } else {
...@@ -359,9 +359,9 @@ if( atomI == targetAtom ){ ...@@ -359,9 +359,9 @@ if( atomI == targetAtom ){
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1]; sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2]; sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0]; sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1].x;
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1]; sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1].y;
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2]; sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1].z;
if( tgx % 2 == 0 ){ if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] ); sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
...@@ -187,6 +187,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -187,6 +187,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr; float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2); float r = sqrtf(r2);
// calculate the error function damping terms // calculate the error function damping terms
...@@ -317,8 +318,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -317,8 +318,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0].x = fim0 - fid0; fields[0].x = fim0 - fid0;
fields[1].x = fim1 - fid1; fields[1].x = fim1 - fid1;
fields[2].x = fim2 - fid2; fields[2].x = fim2 - fid2;
......
...@@ -75,6 +75,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -75,6 +75,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr; float r2 = xr*xr + yr* yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2); float r = sqrtf(r2);
// calculate the error function damping terms // calculate the error function damping terms
...@@ -160,8 +161,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -160,8 +161,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0].x = fimd0 - fid0; fields[0].x = fimd0 - fid0;
fields[0].y = fkmd0 - fkd0; fields[0].y = fkmd0 - fkd0;
fields[0].z = fimp0 - fip0; fields[0].z = fimp0 - fip0;
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment