Preliminary implementation of faster (N^3/2) Ewald kernels in Cuda

353529b1 · Rossen Apostolov · 52c04559 · 353529b1 · 353529b1 · 353529b1
Commit 353529b1 authored May 21, 2009 by Rossen Apostolov
6 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -335,6 +335,9 @@ struct cudaGmxSimulation {
    float           collisionProbability;           // Collision probability for Andersen thermostat
    float2*         pObcData;                       // Pointer to fixed Born data
    float2*         pAttr;                          // Pointer to additional atom attributes (sig, eps)
+    float2*         pEikr;                          // Pointer to exponents of reciprocal vectors and atom coordinates (ewald)
+    float2*         pStructureFactor;               // Pointer to the structure factors (ewald)
+    float2*         pCosSinSum;                     // Pointer to the cos/sin sums (ewald)
    unsigned int    bonds;                          // Number of bonds
    int4*           pBondID;                        // Bond atom and output buffer IDs
    float2*         pBondParameter;                 // Bond parameters

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1087,6 +1087,12 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.pObcChain                  = gpu->psObcChain->_pDevStream[0];
    gpu->psSigEps2                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
    gpu->sim.pAttr                      = gpu->psSigEps2->_pDevStream[0];
+    gpu->psEwaldEikr                    = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldEikr");
+    gpu->sim.pEikr                      = gpu->psEwaldEikr->_pDevStream[0];
+    gpu->psEwaldStructureFactor         = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldStructureFactor");
+    gpu->sim.pStructureFactor           = gpu->psEwaldStructureFactor->_pDevStream[0];
+    gpu->psEwaldCosSinSum               = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldCosSinSum");
+    gpu->sim.pCosSinSum                 = gpu->psEwaldCosSinSum->_pDevStream[0];
    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
    gpu->sim.pObcData                   = gpu->psObcData->_pDevStream[0];
    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
@@ -1486,6 +1492,9 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psxVector4;
    delete gpu->psvVector4;
    delete gpu->psSigEps2; 
+    delete gpu->psEwaldEikr; 
+    delete gpu->psEwaldStructureFactor; 
+    delete gpu->psEwaldCosSinSum; 
    delete gpu->psObcData; 
    delete gpu->psObcChain;
    delete gpu->psBornForce;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -86,6 +86,9 @@ struct _gpuContext {
    CUDAStream<float4>* psxVector4;
    CUDAStream<float4>* psvVector4;
    CUDAStream<float2>* psSigEps2; 
+    CUDAStream<float2>* psEwaldEikr; 
+    CUDAStream<float2>* psEwaldStructureFactor; 
+    CUDAStream<float2>* psEwaldCosSinSum; 
    CUDAStream<float2>* psObcData; 
    CUDAStream<float>* psObcChain;
    CUDAStream<float>* psBornForce;

--- a/platforms/cuda/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Rossen P. Apostolov, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+  /* Define multiply operations for floats */
+  __device__ float2 MultofFloat2(float2 a, float2 b)
+  {
+       float2 c;
+       c.x = a.x * b.x - a.y * b.y;
+       c.y = a.x * b.y + a.y * b.x;
+       return c;
+  }
+  __device__ float2 ConjMultofFloat2(float2 a, float2 b)
+  {
+       float2 c;
+       c.x = a.x*b.x + a.y*b.y;
+       c.y = a.y*b.x - a.x*b.y;
+       return c;
+  }
+  __device__ float2 FloatMultFloat2(float r, float2 a)
+  {
+       float2 b;
+       b.x = r*a.x;
+       b.y = r*a.y;
+       return b;
+  }
+  __device__ float2 FloatMultConjFloat2(float r, float2 a)
+  {
+       float2 b;
+       b.x = r*a.y;
+       b.y = r*a.x;
+       return b;
+  }
+__global__ void kCalculateEwaldFastEikr_kernel()
+{
+    int kmax               = cSim.kmax;
+    float4 apos;
+    unsigned int atom    = threadIdx.x + blockIdx.x * blockDim.x;
+    while (atom < cSim.atoms)
+    {
+//generic form of the array
+// pEikr[ atomID*kmax*3 + k*3 + m]
+        for(unsigned int m = 0; (m < 3); m++) {
+// k = 0, explicitly
+          cSim.pEikr[atom*kmax*3 + 0 + m].x = 1;
+          cSim.pEikr[atom*kmax*3 + 0 + m].y = 0;
+// k = 1, explicitly
+          cSim.pEikr[atom*kmax*3 + 3 + m].x = cos(apos.x*cSim.recipBoxSizeX);
+          cSim.pEikr[atom*kmax*3 + 3 + m].y = sin(apos.x*cSim.recipBoxSizeX);
+          cSim.pEikr[atom*kmax*3 + 4 + m].x = cos(apos.y*cSim.recipBoxSizeY);
+          cSim.pEikr[atom*kmax*3 + 4 + m].y = sin(apos.y*cSim.recipBoxSizeY);
+          cSim.pEikr[atom*kmax*3 + 5 + m].x = cos(apos.z*cSim.recipBoxSizeZ);
+          cSim.pEikr[atom*kmax*3 + 5 + m].y = sin(apos.z*cSim.recipBoxSizeZ);
+        }
+// k > 1, by recursion
+        for(unsigned int k=2; (k<kmax); k++) {
+          for(unsigned int m=0; (m<3); m++) {
+            cSim.pEikr[atom*kmax*3 + k*3 + m] = MultofFloat2 (cSim.pEikr[atom*kmax*3 + (k-1)*3 + m] , cSim.pEikr[atom*kmax*3 + 3 + m]);
+          }
+        }
+        atom                            += blockDim.x * gridDim.x;
+    }
+}
+__global__ void kCalculateEwaldFastCosSinSums_kernel()
+{
+// hard-coded maximum k-vectors, no interface yet
+    int kmax               = cSim.kmax;
+//    float2 eikr;
+    float4 apos;
+    int lowry = 0;
+    int lowrz = 1;
+    int numRx = 20+1;
+    int numRy = 20+1;
+    int numRz = 20+1;
+    unsigned int totalK  = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
+    float2 tab_xy;
+    int index;
+    unsigned int atom    = threadIdx.x + blockIdx.x * blockDim.x;
+    while (atom < cSim.atoms)
+    {
+    apos                         = cSim.pPosq[atom];
+// **********************************************************************
+// cSim.pEikr[atom*kmax*3 + k*3 + m] 
+    for(int rx = 0; rx < numRx; rx++) {
+      for(int ry = lowry; ry < numRy; ry++) {
+        if(ry >= 0) {
+      //      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+            tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
+        }
+        else {
+      //      tab_xy[n] = EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+            tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
+        }
+        for (int rz = lowrz; rz < numRz; rz++) {
+       // next one is scary!
+          index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
+          if( rz >= 0) {
+             //tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2));
+             cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] ));
+          }
+          else {
+            //  tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
+             cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
+          }
+            cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
+            cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
+          lowrz = 1 - numRz;
+        }
+        lowry = 1 - numRy;
+      }
+    }
+// **********************************************************************
+        atom                            += blockDim.x * gridDim.x;
+    }
+}
+__global__ void kCalculateEwaldFastForces_kernel()
+{
+    float PI               = 3.14159265358979323846f;
+// hard-coded maximum k-vectors, no interface yet
+//    int kmax               = cSim.kmax;
+    const float epsilon     =  1.0;
+    float recipCoeff = (4*PI/cSim.V/epsilon);
+//    float2 eikr;
+//    float4 apos;
+    int lowry = 0;
+    int lowrz = 1;
+    int numRx = 20+1;
+    int numRy = 20+1;
+    int numRz = 20+1;
+    unsigned int totalK  = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
+    int index;
+    unsigned int atom    = threadIdx.x + blockIdx.x * blockDim.x;
+    while (atom < cSim.atoms)
+    {
+ //   apos                         = cSim.pPosq[atom];
+// **********************************************************************
+// cSim.pEikr[atom*kmax*3 + k*3 + m] 
+    for(int rx = 0; rx < numRx; rx++) {
+      float kx = rx * cSim.recipBoxSizeX;
+      for(int ry = lowry; ry < numRy; ry++) {
+      float ky = ry * cSim.recipBoxSizeY;
+        for (int rz = lowrz; rz < numRz; rz++) {
+        float kz = rz * cSim.recipBoxSizeZ;
+       // next one is scary!
+          index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
+          float k2 = kx * kx + ky * ky + kz * kz;
+          float ak = exp(k2*cSim.factorEwald) / k2;
+          float dEdR = ak * (cSim.pCosSinSum[index].x  * cSim.pStructureFactor[atom*totalK + index].y - cSim.pCosSinSum[index].y * cSim.pStructureFactor[atom*totalK + index].x);
+          cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx ;
+          cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky ;
+          cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz ;
+          lowrz = 1 - numRz;
+        }
+        lowry = 1 - numRy;
+      }
+    }
+// **********************************************************************
+        atom                            += blockDim.x * gridDim.x;
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJEwaldReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJEwaldReciprocal.h
@@ -32,7 +32,6 @@
 __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
 {
-    float alphaEwald       = cSim.alphaEwald;
    float eps0             = 5.72765E-4;
    int numRx              = 20+1;
@@ -46,13 +45,15 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
    float V = cSim.cellVolume;
-    float4 apos1, apos2 ;
+    float4 apos1, apos2 ; // i nteracting atoms
+    float4 af; // atomic force
    unsigned int atomID1    = threadIdx.x + blockIdx.x * blockDim.x;
-    while (atomID1 < cSim.stride * cSim.outputBuffers)
+    while (atomID1 < cSim.atoms)
    {
        apos1             = cSim.pPosq[atomID1];
+        af                = cSim.pForce4[atomID1];
        unsigned int atomID2    = 0;
        while (atomID2 < cSim.atoms)
@@ -85,9 +86,9 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
                      CosI = cos ( kx * apos1.x + ky * apos1.y + kz * apos1.z );
                      CosJ = cos ( kx * apos2.x + ky * apos2.y + kz * apos2.z );
-                      cSim.pForce4[atomID1].x -= (2.0 / (V * eps0 ))  * Qi * ( kx/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
+                      af.x -= (2.0 / (V * eps0 ))  * Qi * ( kx/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
-                      cSim.pForce4[atomID1].y -= (2.0 / (V * eps0 ))  * Qi * ( ky/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
+                      af.y -= (2.0 / (V * eps0 ))  * Qi * ( ky/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
-                      cSim.pForce4[atomID1].z -= (2.0 / (V * eps0 ))  * Qi * ( kz/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
+                      af.z -= (2.0 / (V * eps0 ))  * Qi * ( kz/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
                      lowrz = 1 - numRz;
                    }
@@ -98,7 +99,8 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
                atomID2++;
       }
+       cSim.pForce4[atomID1]               = af;
       atomID1                            += blockDim.x * gridDim.x;
    }

--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -119,6 +119,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
 // Reciprocal Space Ewald summation is in a separate kernel
 #include "kCalculateCDLJEwaldReciprocal.h"
+#include "kCalculateCDLJEwaldFastReciprocal.h"
 __global__ extern void kCalculateCDLJCutoffForces_12_kernel();
@@ -198,6 +199,7 @@ void kCalculateCDLJForces(gpuContext gpu)
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            if (gpu->bOutputBufferPerWarp)
            {
+// Vanilla (N2) Ewald
                kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
                kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
@@ -205,11 +207,20 @@ void kCalculateCDLJForces(gpuContext gpu)
            }
            else
            {
+// Vanilla (N2) Ewald
                kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
                LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
                kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
                LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
+// If using Fast Ewald, uncomment the lines below
+ //               kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+ //               LAUNCHERROR("kCalculateEwaldFastEikr");
+ //               kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+ //               LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+ //               kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+ //               LAUNCHERROR("kCalculateEwaldFastForces");
            }
    }