More work on fast Ewald (N^3/2) in Cuda

9fdfaca9 · Rossen Apostolov · 6fd7f3bb · 9fdfaca9 · 9fdfaca9
Commit 9fdfaca9 authored May 22, 2009 by Rossen Apostolov
2 changed files
--- a/platforms/cuda/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
@@ -73,25 +73,27 @@ __global__ void kCalculateEwaldFastEikr_kernel()
    while (atom < cSim.atoms)
    {
+          apos                         = cSim.pPosq[atom];
 //generic form of the array
 // pEikr[ atomID*kmax*3 + k*3 + m]
-        for(unsigned int m = 0; (m < 3); m++) {
 // k = 0, explicitly
+        for(unsigned int m = 0; (m < 3); m++) {
          cSim.pEikr[atom*kmax*3 + 0 + m].x = 1;
          cSim.pEikr[atom*kmax*3 + 0 + m].y = 0;
+        }
 // k = 1, explicitly
-          cSim.pEikr[atom*kmax*3 + 3 + m].x = cos(apos.x*cSim.recipBoxSizeX);
+          cSim.pEikr[atom*kmax*3 + 3 + 0].x = cos(apos.x*cSim.recipBoxSizeX);
-          cSim.pEikr[atom*kmax*3 + 3 + m].y = sin(apos.x*cSim.recipBoxSizeX);
+          cSim.pEikr[atom*kmax*3 + 3 + 0].y = sin(apos.x*cSim.recipBoxSizeX);
-          cSim.pEikr[atom*kmax*3 + 4 + m].x = cos(apos.y*cSim.recipBoxSizeY);
+          cSim.pEikr[atom*kmax*3 + 3 + 1].x = cos(apos.y*cSim.recipBoxSizeY);
-          cSim.pEikr[atom*kmax*3 + 4 + m].y = sin(apos.y*cSim.recipBoxSizeY);
+          cSim.pEikr[atom*kmax*3 + 3 + 1].y = sin(apos.y*cSim.recipBoxSizeY);
-          cSim.pEikr[atom*kmax*3 + 5 + m].x = cos(apos.z*cSim.recipBoxSizeZ);
+          cSim.pEikr[atom*kmax*3 + 3 + 2].x = cos(apos.z*cSim.recipBoxSizeZ);
-          cSim.pEikr[atom*kmax*3 + 5 + m].y = sin(apos.z*cSim.recipBoxSizeZ);
+          cSim.pEikr[atom*kmax*3 + 3 + 2].y = sin(apos.z*cSim.recipBoxSizeZ);
-        }
 // k > 1, by recursion
        for(unsigned int k=2; (k<kmax); k++) {
          for(unsigned int m=0; (m<3); m++) {
@@ -103,13 +105,12 @@ __global__ void kCalculateEwaldFastEikr_kernel()
    }
 }
-__global__ void kCalculateEwaldFastCosSinSums_kernel()
+__global__ void kCalculateEwaldFastStructureFactors_kernel()
 {
 // hard-coded maximum k-vectors, no interface yet
    int kmax               = cSim.kmax;
-//    float2 eikr;
    float4 apos;
    int lowry = 0;
    int lowrz = 1;
@@ -125,64 +126,110 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel()
    while (atom < cSim.atoms)
    {
-    apos                         = cSim.pPosq[atom];
+           apos                         = cSim.pPosq[atom];
+           // cSim.pEikr[atom*kmax*3 + k*3 + m] 
+           for(int rx = 0; rx < numRx; rx++) 
+           {
+             for(int ry = lowry; ry < numRy; ry++) 
+             {
+               if(ry >= 0) 
+               {
+                   tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
+               }
+               else 
+               {
+                   tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
+               }
+               for (int rz = lowrz; rz < numRz; rz++) 
+               {
+                   index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
+                   if( rz >= 0) 
+                   {
+                      cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( (apos.w ), MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] ));
+                   }
+                   else 
+                   {
+                      cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( ( apos.w ), ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
+                   }
+                     cSim.pCosSinSum[index].x = 0.0;
+                     cSim.pCosSinSum[index].y = 0.0;
+                   lowrz = 1 - numRz;
+               }
+               lowry = 1 - numRy;
+             }
+           }
+        atom                            += blockDim.x * gridDim.x;
+    }
+}
+__global__ void kCalculateEwaldFastCosSinSums_kernel()
+{
+//    float2 eikr;
+    int lowry = 0;
+    int lowrz = 1;
+    int numRx = 20+1;
+    int numRy = 20+1;
+    int numRz = 20+1;
+    unsigned int totalK  = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
+    int index;
+    unsigned int rx    = threadIdx.x + blockIdx.x * blockDim.x;
+    while (rx < numRx)
+    {
 // **********************************************************************
 // cSim.pEikr[atom*kmax*3 + k*3 + m] 
-    for(int rx = 0; rx < numRx; rx++) {
+//    for(int rx = 0; rx < numRx; rx++) {
      for(int ry = lowry; ry < numRy; ry++) {
-        if(ry >= 0) {
-      //      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
-            tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
-        }
-        else {
-      //      tab_xy[n] = EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
-            tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
-        }
        for (int rz = lowrz; rz < numRz; rz++) {
-       // next one is scary!
          index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
-          if( rz >= 0) {
+             for (int atom = 0; atom < cSim.atoms; atom++)
-             //tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2));
+             {
-             cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] ));
+                 cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
-          }
+                 cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
+             }
-          else {
-            //  tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
-             cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
-          }
-            cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
-            cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
          lowrz = 1 - numRz;
        }
        lowry = 1 - numRy;
      }
+        rx                            += blockDim.x * gridDim.x;
    }
-// **********************************************************************
-        atom                            += blockDim.x * gridDim.x;
-    }
 }
 __global__ void kCalculateEwaldFastForces_kernel()
 {
    float PI               = 3.14159265358979323846f;
-// hard-coded maximum k-vectors, no interface yet
-//    int kmax               = cSim.kmax;
    const float epsilon     =  1.0;
    float recipCoeff = (4*PI/cSim.V/epsilon);
-//    float2 eikr;
-//    float4 apos;
    int lowry = 0;
    int lowrz = 1;
    int numRx = 20+1;
@@ -195,10 +242,6 @@ __global__ void kCalculateEwaldFastForces_kernel()
    while (atom < cSim.atoms)
    {
- //   apos                         = cSim.pPosq[atom];
-// **********************************************************************
-// cSim.pEikr[atom*kmax*3 + k*3 + m] 
    for(int rx = 0; rx < numRx; rx++) {
@@ -219,6 +262,7 @@ __global__ void kCalculateEwaldFastForces_kernel()
          float ak = exp(k2*cSim.factorEwald) / k2;
          float dEdR = ak * (cSim.pCosSinSum[index].x  * cSim.pStructureFactor[atom*totalK + index].y - cSim.pCosSinSum[index].y * cSim.pStructureFactor[atom*totalK + index].x);
          cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx ;
          cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky ;
          cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz ;
@@ -228,7 +272,9 @@ __global__ void kCalculateEwaldFastForces_kernel()
        lowry = 1 - numRy;
      }
    }
-// **********************************************************************
        atom                            += blockDim.x * gridDim.x;
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -215,12 +215,14 @@ void kCalculateCDLJForces(gpuContext gpu)
                LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
 // If using Fast Ewald, uncomment the lines below
- //               kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+//                kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
- //               LAUNCHERROR("kCalculateEwaldFastEikr");
+//                LAUNCHERROR("kCalculateEwaldFastEikr");
- //               kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+//                kCalculateEwaldFastStructureFactors_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
- //               LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+//                LAUNCHERROR("kCalculateEwaldFastStructureFactors_kernel");
- //               kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+//                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
- //               LAUNCHERROR("kCalculateEwaldFastForces");
+//                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+//                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+//                LAUNCHERROR("kCalculateEwaldFastForces");
            }
    }