Optimizations to PME

408469c3 · Peter Eastman · 45b0302d · 408469c3 · 408469c3 · 408469c3
Commit 408469c3 authored Oct 07, 2010 by Peter Eastman
5 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -152,15 +152,26 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
 }

 __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI,   PmeDirectElectrostaticParticle& atomJ,
-                                                               float* scalingFactors, float*  outputForce, float  outputTorque[2][3], float* energy
+                                                               float* scalingFactors, float*  outputForce, float3  outputTorque[2], float* energy // outputTorque: two float3 slots, matching the callers' float3 torque[2]
 #ifdef AMOEBA_DEBUG
                                                               ,float4* debugArray
 #endif
 ){
+    float xr    = atomJ.x - atomI.x;
+    float yr    = atomJ.y - atomI.y;
+    float zr    = atomJ.z - atomI.z;

+    // periodic box

-    float e,ei;
-    float erl,erli;
+    xr         -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+    yr         -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+    zr         -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+
+    float r2    = xr*xr + yr*yr + zr*zr;
+    if( r2 <= cSim.nonbondedCutoffSqr ){
+
+        float r      = sqrt(r2);
+        float ck     = atomJ.q;

        float conversionFactor   = (-cAmoebaSim.electric/cAmoebaSim.dielec);

@@ -184,22 +195,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
        float qi8       = atomI.labFrameQuadrupole[7];
        float qi9       = atomI.labFrameQuadrupole[8];

-    float xr    = atomJ.x - atomI.x;
-    float yr    = atomJ.y - atomI.y;
-    float zr    = atomJ.z - atomI.z;
-
-    // periodic box 
-
-    xr         -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-    yr         -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-    zr         -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-
-    float r2    = xr*xr + yr*yr + zr*zr;
-    if( r2 <= cSim.nonbondedCutoffSqr ){
-
-        float r      = sqrt(r2);
-        float ck     = atomJ.q;
-      
        float dk1  = atomJ.labFrameDipole[0];
        float dk2  = atomJ.labFrameDipole[1];
        float dk3  = atomJ.labFrameDipole[2];
@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros

        // compute the energy contributions for this interaction

-        e    = bn0*gl0 + bn1*(gl1+gl6)
+        float e    = bn0*gl0 + bn1*(gl1+gl6)
                 + bn2*(gl2+gl7+gl8)
                 + bn3*(gl3+gl5) + bn4*gl4;
-        ei    = 0.5f * (bn1*(gli1+gli6)
+        float ei    = 0.5f * (bn1*(gli1+gli6)
                       + bn2*(gli2+gli7) + bn3*gli3);

        // get the real energy without any screening function

-        erl = rr1*gl0 + rr3*(gl1+gl6)
+        float erl = rr1*gl0 + rr3*(gl1+gl6)
                   + rr5*(gl2+gl7+gl8)
                   + rr7*(gl3+gl5) + rr9*gl4;
-        erli = 0.5f*(rr3*(gli1+gli6)*psc3
+        float erli = 0.5f*(rr3*(gli1+gli6)*psc3
                    + rr5*(gli2+gli7)*psc5
                    + rr7*gli3*psc7);
        e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros

        // intermediate variables for induced force terms

-        float gfi1 = 0.5f*bn2*(gli1+glip1+gli6+glip6)
-                      + 0.5f*bn2*scip2
-                      + 0.5f*bn3*(gli2+glip2+gli7+glip7)
-                      - 0.5f*bn3*(sci3*scip4+scip3*sci4)
-                      + 0.5f*bn4*(gli3+glip3);
+        float gfi1 = 0.5f*(bn2*(gli1+glip1+gli6+glip6)
+                      + bn2*scip2
+                      + bn3*(gli2+glip2+gli7+glip7)
+                      - bn3*(sci3*scip4+scip3*sci4)
+                      + bn4*(gli3+glip3));
        float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
        float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
        float gfi4 = 2.0f * bn2;
        float gfi5 = bn3 * (sci4+scip4);
        float gfi6 = -bn3 * (sci3+scip3);
-        float gfri1 = 0.5f*rr5*((gli1+gli6)*psc3
+        float gfri1 = 0.5f*(rr5*((gli1+gli6)*psc3
                             + (glip1+glip6)*dsc3
                             + scip2*usc3)
-                  + 0.5f*rr7*((gli7+gli2)*psc5
+                  + rr7*((gli7+gli2)*psc5
                             + (glip7+glip2)*dsc5
                      - (sci3*scip4+scip3*sci4)*usc5)
-                  + 0.5f*rr9*(gli3*psc7+glip3*dsc7);
+                  + rr9*(gli3*psc7+glip3*dsc7));
        float gfri4 = 2.0f * rr5;
        float gfri5 = rr7 * (sci4*psc7+scip4*dsc7);
        float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7);
@@ -858,13 +853,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
        outputForce[2]           = conversionFactor*(ftm23 + ftm2i3);

        conversionFactor        *= -1.0;
-        outputTorque[0][0]       =  conversionFactor*(ttm21 + ttm2i1);
-        outputTorque[0][1]       =  conversionFactor*(ttm22 + ttm2i2);
-        outputTorque[0][2]       =  conversionFactor*(ttm23 + ttm2i3);
+        outputTorque[0].x       =  conversionFactor*(ttm21 + ttm2i1);   // slot 0: callers add this to localParticle.torque
+        outputTorque[0].y       =  conversionFactor*(ttm22 + ttm2i2);
+        outputTorque[0].z       =  conversionFactor*(ttm23 + ttm2i3);

-        outputTorque[1][0]       =  conversionFactor*(ttm31 + ttm3i1);
-        outputTorque[1][1]       =  conversionFactor*(ttm32 + ttm3i2);
-        outputTorque[1][2]       =  conversionFactor*(ttm33 + ttm3i3);
+        outputTorque[1].x       =  conversionFactor*(ttm31 + ttm3i1);   // slot 1: callers add this to psA[jIdx].torque / tempTorque
+        outputTorque[1].y       =  conversionFactor*(ttm32 + ttm3i2);
+        outputTorque[1].z       =  conversionFactor*(ttm33 + ttm3i3);

 #ifdef AMOEBA_DEBUG
    int debugIndex               = 0;
@@ -959,13 +954,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
        outputForce[1]           = 0.0f;
        outputForce[2]           = 0.0f;

-        outputTorque[0][0]       = 0.0f;
-        outputTorque[0][1]       = 0.0f;
-        outputTorque[0][2]       = 0.0f;
+        outputTorque[0].x       = 0.0f;
+        outputTorque[0].y       = 0.0f;
+        outputTorque[0].z       = 0.0f;

-        outputTorque[1][0]       = 0.0f;
-        outputTorque[1][1]       = 0.0f;
-        outputTorque[1][2]       = 0.0f;
+        outputTorque[1].x       = 0.0f;
+        outputTorque[1].y       = 0.0f;
+        outputTorque[1].z       = 0.0f;

        *energy                  = 0.0f;


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
            {

                float force[3];
-                float torque[2][3];
+                float3 torque[2];

                unsigned int atomJ = y + j;

@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
                localParticle.force[1]            += mask ? force[1]     : 0.0f;
                localParticle.force[2]            += mask ? force[2]     : 0.0f;

-                localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
-                localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
-                localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
+                localParticle.torque[0]           += mask ? torque[0].x : 0.0f;
+                localParticle.torque[1]           += mask ? torque[0].y : 0.0f;
+                localParticle.torque[2]           += mask ? torque[0].z : 0.0f;

                totalEnergy                       += mask ? 0.5*energy   : 0.0f;

@@ -181,15 +181,15 @@ if( atomI == targetAtom ){


    index                             += cAmoebaSim.paddedNumberOfAtoms;
-    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
-    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
-    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
+    debugArray[index].x                = mask ? torque[0].x : 0.0f;
+    debugArray[index].y                = mask ? torque[0].y : 0.0f;
+    debugArray[index].z                = mask ? torque[0].z : 0.0f;
    debugArray[index].w                = mask ? energy       : 0.0f;

    index                             += cAmoebaSim.paddedNumberOfAtoms;
-    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
-    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
-    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
+    debugArray[index].x                = mask ? torque[0].x : 0.0f;
+    debugArray[index].y                = mask ? torque[0].y : 0.0f;
+    debugArray[index].z                = mask ? torque[0].z : 0.0f;
    debugArray[index].w                = (float) (blockIdx.x * blockDim.x + threadIdx.x);

    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
@@ -304,7 +304,7 @@ if( atomI == targetAtom ){
                        unsigned int atomJ = y + jIdx;

                        float force[3];
-                        float torque[2][3];
+                        float3 torque[2];

                        // set scale factors

@@ -335,9 +335,9 @@ if( atomI == targetAtom ){
                        localParticle.force[1]         += mask ? force[1]      : 0.0f;
                        localParticle.force[2]         += mask ? force[2]      : 0.0f;

-                        localParticle.torque[0]        += mask ? torque[0][0]  : 0.0f;
-                        localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
-                        localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
+                        localParticle.torque[0]        += mask ? torque[0].x  : 0.0f;
+                        localParticle.torque[1]        += mask ? torque[0].y  : 0.0f;
+                        localParticle.torque[2]        += mask ? torque[0].z  : 0.0f;

                        totalEnergy                    += mask ? energy        : 0.0f;

@@ -349,9 +349,9 @@ if( atomI == targetAtom ){
                            psA[jIdx].force[1]               -= mask ?  force[1]     : 0.0f;
                            psA[jIdx].force[2]               -= mask ?  force[2]     : 0.0f;

-                            psA[jIdx].torque[0]              += mask ?  torque[1][0] : 0.0f;
-                            psA[jIdx].torque[1]              += mask ?  torque[1][1] : 0.0f;
-                            psA[jIdx].torque[2]              += mask ?  torque[1][2] : 0.0f;
+                            psA[jIdx].torque[0]              += mask ?  torque[1].x : 0.0f;
+                            psA[jIdx].torque[1]              += mask ?  torque[1].y : 0.0f;
+                            psA[jIdx].torque[2]              += mask ?  torque[1].z : 0.0f;

                        } else {

@@ -359,9 +359,9 @@ if( atomI == targetAtom ){
                            sA[threadIdx.x].tempForce[1]     = mask ? 0.0f : force[1];
                            sA[threadIdx.x].tempForce[2]     = mask ? 0.0f : force[2];

-                            sA[threadIdx.x].tempTorque[0]    = mask ? 0.0f : torque[1][0];
-                            sA[threadIdx.x].tempTorque[1]    = mask ? 0.0f : torque[1][1];
-                            sA[threadIdx.x].tempTorque[2]    = mask ? 0.0f : torque[1][2];
+                            sA[threadIdx.x].tempTorque[0]    = mask ? 0.0f : torque[1].x;
+                            sA[threadIdx.x].tempTorque[1]    = mask ? 0.0f : torque[1].y;
+                            sA[threadIdx.x].tempTorque[2]    = mask ? 0.0f : torque[1].z;

                            if( tgx % 2 == 0 ){
                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -187,6 +187,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

    float r2          = xr*xr + yr*yr + zr*zr;
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        float r           = sqrtf(r2);

        // calculate the error function damping terms
@@ -317,8 +318,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&

        // increment the field at each site due to this interaction

-    if( r2 <= cSim.nonbondedCutoffSqr ){
-
        fields[0].x       = fim0 - fid0;
        fields[1].x       = fim1 - fid1;
        fields[2].x       = fim2 - fid2;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -75,6 +75,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

    float r2          = xr*xr + yr* yr + zr*zr;
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        float r           = sqrtf(r2);

        // calculate the error function damping terms
@@ -160,8 +161,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce

        // increment the field at each site due to this interaction

-    if( r2 <= cSim.nonbondedCutoffSqr ){
-
        fields[0].x       = fimd0 - fid0;
        fields[0].y       = fkmd0 - fkd0;
        fields[0].z       = fimp0 - fip0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -29,7 +29,7 @@
 __global__
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
+#elif (__CUDA_ARCH__ >= 120)
 __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)