Optimizations to Ewald summation (in progress)

705ec545 · Peter Eastman · 530e4ea0 · 705ec545 · 705ec545 · 705ec545
Commit 705ec545 authored Jun 30, 2009 by Peter Eastman
5 changed files
--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -432,7 +432,7 @@ void gpuSetEwaldParameters(gpuContext gpu)//, float alphaEwald, int kmax )
    gpu->sim.alphaEwald         = alpha;
    gpu->sim.factorEwald        = -1 / (4*alpha*alpha);
    gpu->sim.kmax               = 20+1;
-    gpu->psEwaldEikr            = new CUDAStream<float2>(gpu->sim.atoms*gpu->sim.kmax, 1, "EwaldEikr");
+    gpu->psEwaldEikr            = new CUDAStream<float2>(gpu->sim.atoms*gpu->sim.kmax*3, 1, "EwaldEikr");
    gpu->sim.pEwaldEikr         = gpu->psEwaldEikr->_pDevStream[0];
    gpu->psEwaldCosSinSum       = new CUDAStream<float2>((gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1), 1, "EwaldCosSinSum");
    gpu->sim.pEwaldCosSinSum    = gpu->psEwaldCosSinSum->_pDevStream[0];

--- a/platforms/cuda/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
@@ -24,25 +24,25 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-  /* Define multiply operations for floats */
+/* Define multiply operations for floats */

-  __device__ float2 MultofFloat2(float2 a, float2 b)
-  {
+__device__ float2 MultofFloat2(float2 a, float2 b)
+{
    float2 c;
    c.x = a.x * b.x - a.y * b.y;
    c.y = a.x * b.y + a.y * b.x;
    return c;
+}

-  }
-  
-  __device__ float2 ConjMultofFloat2(float2 a, float2 b)
-  {
+__device__ float2 ConjMultofFloat2(float2 a, float2 b)
+{
    float2 c;
    c.x = a.x*b.x + a.y*b.y;
    c.y = a.y*b.x - a.x*b.y;
    return c;
+}

-  }
+#define EIR(x, y, z) cSim.pEwaldEikr[(x)+(y)*cSim.kmax+(z)*cSim.kmax*3]

 __global__ void kCalculateEwaldFastEikr_kernel()
 {
@@ -63,23 +63,23 @@ __global__ void kCalculateEwaldFastEikr_kernel()

 // k = 0, explicitly
        for(unsigned int m = 0; (m < 3); m++) {
-          cSim.pEwaldEikr[atom*kmax*3 + 0 + m].x = 1;
-          cSim.pEwaldEikr[atom*kmax*3 + 0 + m].y = 0;
+            EIR(atom, 0, m).x = 1;
+            EIR(atom, 0, m).y = 0;
        }
 // k = 1, explicitly
-          cSim.pEwaldEikr[atom*kmax*3 + 3 + 0].x = cos(apos.x*cSim.recipBoxSizeX);
-          cSim.pEwaldEikr[atom*kmax*3 + 3 + 0].y = sin(apos.x*cSim.recipBoxSizeX);
+          EIR(atom, 1, 0).x = cos(apos.x*cSim.recipBoxSizeX);
+          EIR(atom, 1, 0).y = sin(apos.x*cSim.recipBoxSizeX);

-          cSim.pEwaldEikr[atom*kmax*3 + 3 + 1].x = cos(apos.y*cSim.recipBoxSizeY);
-          cSim.pEwaldEikr[atom*kmax*3 + 3 + 1].y = sin(apos.y*cSim.recipBoxSizeY);
+          EIR(atom, 1, 1).x = cos(apos.y*cSim.recipBoxSizeY);
+          EIR(atom, 1, 1).y = sin(apos.y*cSim.recipBoxSizeY);

-          cSim.pEwaldEikr[atom*kmax*3 + 3 + 2].x = cos(apos.z*cSim.recipBoxSizeZ);
-          cSim.pEwaldEikr[atom*kmax*3 + 3 + 2].y = sin(apos.z*cSim.recipBoxSizeZ);
+          EIR(atom, 1, 2).x = cos(apos.z*cSim.recipBoxSizeZ);
+          EIR(atom, 1, 2).y = sin(apos.z*cSim.recipBoxSizeZ);

 // k > 1, by recursion
        for(unsigned int k=2; (k<kmax); k++) {
          for(unsigned int m=0; (m<3); m++) {
-            cSim.pEwaldEikr[atom*kmax*3 + k*3 + m] = MultofFloat2 (cSim.pEwaldEikr[atom*kmax*3 + (k-1)*3 + m] , cSim.pEwaldEikr[atom*kmax*3 + 3 + m]);
+            EIR(atom, k, m) = MultofFloat2(EIR(atom, k-1, m), EIR(atom, 1, m));
          }
        }

@@ -102,11 +102,11 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel()
        float2 sum = make_float2(0.0f, 0.0f);
        for (int atom = 0; atom < cSim.atoms; atom++)
        {
-            float2 tab_xy = (ry >= 0 ? MultofFloat2 (cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 + ry*3 + 1]) :
-                                       ConjMultofFloat2 (cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 - ry*3 + 1]));
+            float2 tab_xy = (ry >= 0 ? MultofFloat2(EIR(atom, rx, 0), EIR(atom, ry, 1)) :
+                                       ConjMultofFloat2(EIR(atom, rx, 0), EIR(atom, -ry, 1)));
            float charge = cSim.pPosq[atom].w;
-            float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 + rz*3 + 2]) :
-                                                ConjMultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 - rz*3 + 2]));
+            float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, EIR(atom, rz, 2)) :
+                                                ConjMultofFloat2(tab_xy, EIR(atom, -rz, 2)));
            sum.x += charge*structureFactor.x;
            sum.y += charge*structureFactor.y;
        }
@@ -132,29 +132,32 @@ __global__ void kCalculateEwaldFastForces_kernel()

    while (atom < cSim.atoms)
    {
+        float4 force = cSim.pForce4[atom];
        float charge = cSim.pPosq[atom].w;
        for (int rx = 0; rx < numRx; rx++) {
            float kx = rx * cSim.recipBoxSizeX;
            for (int ry = lowry; ry < numRy; ry++) {
                float ky = ry * cSim.recipBoxSizeY;
-                float2 tab_xy = (ry >= 0 ? MultofFloat2(cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 + ry*3 + 1]) :
-                                           ConjMultofFloat2(cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 - ry*3 + 1]));
+                float2 tab_xy = (ry >= 0 ? MultofFloat2(EIR(atom, rx, 0), EIR(atom, ry, 1)) :
+                                           ConjMultofFloat2(EIR(atom, rx, 0), EIR(atom, -ry, 1)));
                for (int rz = lowrz; rz < numRz; rz++) {
                    float kz = rz * cSim.recipBoxSizeZ;
                    int index = rx*(numRy*2-1)*(numRz*2-1) + (ry+numRy-1)*(numRz*2-1) + (rz+numRz-1);
                    float k2 = kx*kx + ky*ky + kz*kz;
                    float ak = exp(k2*cSim.factorEwald) / k2;
-                    float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 + rz*3 + 2]) :
-                                                        ConjMultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 - rz*3 + 2]));
-                    float dEdR = ak * charge * (cSim.pEwaldCosSinSum[index].x*structureFactor.y - cSim.pEwaldCosSinSum[index].y*structureFactor.x);
-                    cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx;
-                    cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky;
-                    cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz;
+                    float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, EIR(atom, rz, 2)) :
+                                                        ConjMultofFloat2(tab_xy, EIR(atom, -rz, 2)));
+                    float2 cosSinSum = cSim.pEwaldCosSinSum[index];
+                    float dEdR = ak * charge * (cosSinSum.x*structureFactor.y - cosSinSum.y*structureFactor.x);
+                    force.x += 2 * recipCoeff * dEdR * kx;
+                    force.y += 2 * recipCoeff * dEdR * ky;
+                    force.z += 2 * recipCoeff * dEdR * kz;
                    lowrz = 1 - numRz;
                }
                lowry = 1 - numRy;
            }
        }
+        cSim.pForce4[atom] = force;
        atom += blockDim.x * gridDim.x;
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJEwaldReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJEwaldReciprocal.h
@@ -32,74 +32,54 @@

 __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
 {
-    float eps0             = 5.72765E-4;
-
-    int numRx              = 20+1;
-    int numRy              = 20+1;
-    int numRz              = 20+1;
-
-    int lowry, lowrz;
-
-    float kx, ky, kz, k2, ek;
-    float Qi, Qj, SinI, SinJ, CosI, CosJ;
-
-    float V = cSim.cellVolume;
-
-    float4 apos1, apos2 ; // i nteracting atoms
-    float4 af; // atomic force
+    const float eps0 = 1.0f/(4.0f*3.1415926535f*cSim.epsfac);

    unsigned int atomID1    = threadIdx.x + blockIdx.x * blockDim.x;

    while (atomID1 < cSim.atoms)
    {
-        apos1             = cSim.pPosq[atomID1];
-        af                = cSim.pForce4[atomID1];
-
+        float4 apos1 = cSim.pPosq[atomID1];
+        float4 af    = cSim.pForce4[atomID1];
        unsigned int atomID2 = 0;
        while (atomID2 < cSim.atoms)
        {
-                apos2             = cSim.pPosq[atomID2];
-
-                lowry = 0;
-                lowrz = 1;
-
-                for(int rx = 0; rx < numRx; rx++) {
-
-                  kx = rx * cSim.recipBoxSizeX;
-
-                  for(int ry = lowry; ry < numRy; ry++) {
-
-                    ky = ry * cSim.recipBoxSizeY;
+            float4 apos2 = cSim.pPosq[atomID2];
+            float scale = 2.0f*apos1.w*apos2.w/(cSim.cellVolume*eps0);

-                    for (int rz = lowrz; rz < numRz; rz++) {
+            int lowry = 0;
+            int lowrz = 1;

-                      kz = rz * cSim.recipBoxSizeZ;
-
-                      k2 = kx * kx + ky * ky + kz * kz;
-                      ek = exp (  k2 * cSim.factorEwald);
-
-                      Qi = apos1.w ;
-                      Qj = apos2.w ;
-
-                      SinI = sin ( kx * apos1.x + ky * apos1.y + kz * apos1.z );
-                      SinJ = sin ( kx * apos2.x + ky * apos2.y + kz * apos2.z );
-                      CosI = cos ( kx * apos1.x + ky * apos1.y + kz * apos1.z );
-                      CosJ = cos ( kx * apos2.x + ky * apos2.y + kz * apos2.z );
-
-                      af.x -= (2.0 / (V * eps0 ))  * Qi * ( kx/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
-                      af.y -= (2.0 / (V * eps0 ))  * Qi * ( ky/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
-                      af.z -= (2.0 / (V * eps0 ))  * Qi * ( kz/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
-
-                      lowrz = 1 - numRz;
+            for(int rx = 0; rx < cSim.kmax; rx++)
+            {
+                float kx = rx*cSim.recipBoxSizeX;
+                for(int ry = lowry; ry < cSim.kmax; ry++)
+                {
+                    float ky = ry*cSim.recipBoxSizeY;
+                    for (int rz = lowrz; rz < cSim.kmax; rz++)
+                    {
+                        float kz = rz*cSim.recipBoxSizeZ;
+                        float k2 = kx*kx + ky*ky + kz*kz;
+                        float ek = exp(k2*cSim.factorEwald);
+
+                        float arg1 = kx*apos1.x + ky*apos1.y + kz*apos1.z;
+                        float arg2 = kx*apos2.x + ky*apos2.y + kz*apos2.z;
+                        float sinI = sin(arg1);
+                        float sinJ = sin(arg2);
+                        float cosI = cos(arg1);
+                        float cosJ = cos(arg2);
+
+                        float f = scale * ek * (-sinI*cosJ + cosI*sinJ) / k2;
+                        af.x -= kx*f;
+                        af.y -= ky*f;
+                        af.z -= kz*f;
+
+                        lowrz = 1 - cSim.kmax;
                    }
-
-                    lowry = 1 - numRy;
+                    lowry = 1 - cSim.kmax;
                }
            }
-
            atomID2++;
       }
-   
       cSim.pForce4[atomID1] = af;
       atomID1 += blockDim.x * gridDim.x;


--- a/platforms/cuda/src/kernels/kLangevinUpdate.h
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.h
@@ -155,7 +155,7 @@ __global__ void kLangevinUpdatePart2_kernel()
        CM.x                   += mass * velocity.x;
        CM.y                   += mass * velocity.y;
        CM.z                   += mass * velocity.z;
-#endif;
+#endif

        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
                                  sqrtInvMass * random4b.x;

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
@@ -36,6 +36,8 @@
 // make sure that erf() and erfc() are defined.
 #include "MSVC_erfc.h"

+using std::vector;
+
 /**---------------------------------------------------------------------------------------

   ReferenceLJCoulombIxn constructor
@@ -279,9 +281,9 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
 // setup K-vectors

  #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
-  d_complex* eir = new d_complex[kmax*numberOfAtoms*3];
-  d_complex* tab_xy = new d_complex[numberOfAtoms];
-  d_complex* tab_qxyz = new d_complex[numberOfAtoms];
+  vector<d_complex> eir(kmax*numberOfAtoms*3);
+  vector<d_complex> tab_xy(numberOfAtoms);
+  vector<d_complex> tab_qxyz(numberOfAtoms);

  if (kmax < 1) {
      std::stringstream message;
@@ -389,7 +391,7 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at

       // allocate and initialize exclusion array

-       int* exclusionIndices = new int[numberOfAtoms];
+       vector<int> exclusionIndices(numberOfAtoms);
       for( int ii = 0; ii < numberOfAtoms; ii++ ){
          exclusionIndices[ii] = -1;
       }
@@ -428,11 +430,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
          }
       }

-       delete[] exclusionIndices;
-       delete[] eir;
-       delete[] tab_xy;
-       delete[] tab_qxyz;
-
 // ***********************************************************************

        if( totalEnergy ) {