Real space Ewald (direct summation) is now in kCalculateCDLJForces.h,...

Real space Ewald (direct summation) is now in kCalculateCDLJForces.h, reciprocal summation is done in a separate kernel

Real space Ewald (direct summation) is now in kCalculateCDLJForces.h,...
Real space Ewald (direct summation) is now in kCalculateCDLJForces.h, reciprocal summation is done in a separate kernel
1b9ab82b · Rossen Apostolov · d3b512ad · 1b9ab82b · 1b9ab82b · 1b9ab82b
Commit 1b9ab82b authored Apr 07, 2009 by Rossen Apostolov
3 changed files
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -108,7 +108,23 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
 #include "kCalculateCDLJForces.h"

 // Include version of the kernel with Ewald method
-#include "kCalculateEwald.h"
+
+    // Real Space Ewald uses almost the same kernels as Periodic
+    
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define USE_EWALD
+#define METHOD_NAME(a, b) a##EwaldDirect##b
+#include "kCalculateCDLJForces.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##EwaldDirectByWarp##b
+#include "kCalculateCDLJForces.h"
+
+     // Reciprocal Space Ewald summation is in a separate kernel
+//#include "kCalculateEwaldReciprocal.h"

 __global__ extern void kCalculateCDLJCutoffForces_12_kernel();

@@ -177,10 +193,10 @@ void kCalculateCDLJForces(gpuContext gpu)
            LAUNCHERROR("kCalculateCDLJPeriodicForces");
            break;
        case EWALD:
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            kFindBlockBoundsEwaldDirect_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsEwaldDirect");
+            kFindBlocksWithInteractionsEwaldDirect_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsEwaldDirect");
            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
            if (result != CUDPP_SUCCESS)
@@ -190,15 +206,15 @@ void kCalculateCDLJForces(gpuContext gpu)
            }
            gpu->psInteractionCount->Download();
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+            kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
-                kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
-            LAUNCHERROR("kCalculateCDLJPeriodicForces");
+            LAUNCHERROR("kCalculateCDLJEwaldDirectForces");

    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.h
@@ -46,6 +46,12 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
 #endif

+#ifdef USE_EWALD
+    float alphaEwald       =  3.123413;
+    float PI = 3.14159265358979323846f;
+    float SQRT_PI  = sqrt(PI);
+#endif
+
    unsigned int lasty = 0xFFFFFFFF;
    while (pos < end)
    {
@@ -108,7 +114,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
                    eps             = a.y * psA[j].eps;
                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r               = sqrt(r2);
+                    float alphaR    = alphaEwald * r;
+                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+    #else
                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+    #endif
 #else
                    dEdR           += apos.w * psA[j].q * invR;
 #endif
@@ -151,7 +163,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
                    eps             = a.y * psA[j].eps;
                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r               = sqrt(r2);
+                    float alphaR    = alphaEwald * r;
+                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+    #else
                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+    #endif
 #else
                    dEdR           += apos.w * psA[j].q * invR;
 #endif
@@ -243,7 +261,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
                        eps             = a.y * psA[tj].eps;
                        dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r               = sqrt(r2);
+                    float alphaR    = alphaEwald * r;
+                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+    #else
                        dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+    #endif
 #else
                        dEdR           += apos.w * psA[tj].q * invR;
 #endif
@@ -292,7 +316,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
                            eps             = a.y * psA[j].eps;
                            dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r               = sqrt(r2);
+                    float alphaR    = alphaEwald * r;
+                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+    #else
                            dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+    #endif
 #else
                            dEdR           += apos.w * psA[j].q * invR;
 #endif
@@ -377,7 +407,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
                    eps             = a.y * psA[tj].eps;
                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r               = sqrt(r2);
+                    float alphaR    = alphaEwald * r;
+                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+    #else
                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+    #endif
 #else
                    dEdR           += apos.w * psA[tj].q * invR;
 #endif

--- a/platforms/cuda/src/kernels/kCalculateEwald.h
+++ b/platforms/cuda/src/kernels/kCalculateEwald.h
@@ -71,17 +71,6 @@ __device__ cuComplex FloatComplexMul(float r, cuComplex a)

 __global__ void  kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int numWorkUnits)
 {
-/*
-
-    extern __shared__ Atom sA[];
-    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    int pos = warp*numWorkUnits/totalWarps;
-    int end = (warp+1)*numWorkUnits/totalWarps;
-
-//###########################################################################
-// EWALD RECIP SPACE
-//###########################################################################

 // *******************************************************************
    float alphaEwald       =  3.123413f;
@@ -110,32 +99,40 @@ __global__ void  kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
    unsigned int numRz = 60+1;
    const int kmax = 61;

-    cuComplex eir[kmax][numberOfAtoms][3];
-    cuComplex tab_xy[numberOfAtoms];
-    cuComplex tab_qxyz[numberOfAtoms];
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
+
+    cuComplex eir[kmax][cSim.atoms][3];
+    cuComplex tab_xy[cSim.atoms];
+    cuComplex tab_qxyz[cSim.atoms];


-  for(unsigned int i = 0; i < numberOfAtoms; i++) {
+  while (pos < cSim.atoms)
+  {
+
+      float4 apos             = cSim.pPosq[pos];
+
      for(unsigned int m = 0; (m < 3); m++) {
-      eir[0][i][m].x = 1;
-      eir[0][i][m].y = 0;
+        eir[0][pos][m].x = 1;
+        eir[0][pos][m].y = 0;
      }


-      eir[1][i][0].x = cos(apos.x*recipBoxSizeX);
-      eir[1][i][0].y = sin(apos.x*recipBoxSizeX);
+      eir[1][pos][0].x = cos(apos.x*recipBoxSizeX);
+      eir[1][pos][0].y = sin(apos.x*recipBoxSizeX);

-      eir[1][i][1].x = cos(apos.y*recipBoxSizeY);
-      eir[1][i][1].y = sin(apos.y*recipBoxSizeY);
+      eir[1][pos][1].x = cos(apos.y*recipBoxSizeY);
+      eir[1][pos][1].y = sin(apos.y*recipBoxSizeY);

-      eir[1][i][2].x = cos(apos.z*recipBoxSizeZ);
-      eir[1][i][2].y = sin(apos.z*recipBoxSizeZ);
+      eir[1][pos][2].x = cos(apos.z*recipBoxSizeZ);
+      eir[1][pos][2].y = sin(apos.z*recipBoxSizeZ);

      for(unsigned int j=2; (j<kmax); j++) {
        for(unsigned int m=0; (m<3); m++) {
-        eir[j][i][m] = ComplexMul (eir[j-1][i][m] , eir[1][i][m]);
+          eir[j][pos][m] = ComplexMul (eir[j-1][pos][m] , eir[1][pos][m]);
        }
      }
+
+      pos                    += blockDim.x * gridDim.x;
  }

 // *******************************************************************
@@ -153,14 +150,18 @@ __global__ void  kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
        float ky = ry * recipBoxSizeY;

        if(ry >= 0) {
-          for(int n = 0; n < numberOfAtoms; n++) {
-            tab_xy[n] = ComplexMul (eir[rx][n][0] , eir[ry][n][1]);
+          while (pos < cSim.atoms)
+          {
+            tab_xy[pos] = ComplexMul (eir[rx][pos][0] , eir[ry][pos][1]);
+            pos                    += blockDim.x * gridDim.x;
          }
        }

        else {
-          for(int n = 0; n < numberOfAtoms; n++) {
-            tab_xy[n]= ComplexConjMul (eir[rx][n][0] , (eir[-ry][n][1]));
+          while (pos < cSim.atoms)
+          {
+            tab_xy[pos]= ComplexConjMul (eir[rx][pos][0] , (eir[-ry][pos][1]));
+            pos                    += blockDim.x * gridDim.x;
          }
        }

@@ -173,34 +174,47 @@ __global__ void  kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu

          if( rz >= 0) {

-           for( int n = 0; n < numberOfAtoms; n++) {
-             tab_qxyz[n] = FloatComplexMul ( Charge[n] * ComplexMul (tab_xy[n] , eir[rz][n][2]));
+           while (pos < cSim.atoms)
+           {
+             float4 apos         = cSim.pPosq[pos];
+             apos.w             *= cSim.epsfac;
+             tab_qxyz[pos] = FloatComplexMul ( apos.w * ComplexMul (tab_xy[pos] , eir[rz][pos][2]));
+             pos                    += blockDim.x * gridDim.x;
           }
          }

          else {

-            for( int n = 0; n < numberOfAtoms; n++) {
-              tab_qxyz[n] = FloatComplexMul( Charge[n] * ComplexConjMul (tab_xy[n] , conj(eir[-rz][n][2])) );
+            while (pos < cSim.atoms)
+            {
+              float4 apos         = cSim.pPosq[pos];
+              apos.w             *= cSim.epsfac;
+              tab_qxyz[pos] = FloatComplexMul( apos.w * ComplexConjMul (tab_xy[pos] , conj(eir[-rz][pos][2])) );
+              pos                    += blockDim.x * gridDim.x;
            }
          }

          float cs = 0.0f;
          float ss = 0.0f;

-          for( int n = 0; n < numberOfAtoms; n++) {
-            cs += tab_qxyz[n].x;
-            ss += tab_qxyz[n].y;
+          while (pos < cSim.atoms)
+          {
+            cs += tab_qxyz[pos].x;
+            ss += tab_qxyz[pos].y;
+            pos                    += blockDim.x * gridDim.x;
          }

          recipEnergy += ak * ( cs * cs + ss * ss);
          float vir =  akv * ( cs * cs + ss * ss);

-          for(int n = 0; n < numberOfAtoms; n++) {
-            float force = ak * (cs * tab_qxyz[n].y - ss * tab_qxyz[n].x);
-            forces[n][0] += 2.0 * recipCoeff * force * kx ;
-            forces[n][1] += 2.0 * recipCoeff * force * ky ;
-            forces[n][2] += 2.0 * recipCoeff * force * kz ;
+          while (pos < cSim.atoms)
+          {
+            float4 force            = cSim.pForce4[pos];
+
+            float dEdR = ak * (cs * tab_qxyz[pos].y - ss * tab_qxyz[pos].x);
+            force.x += 2.0 * recipCoeff * dEdR * kx ;
+            force.y += 2.0 * recipCoeff * dEdR * ky ;
+            force.z += 2.0 * recipCoeff * dEdR * kz ;
          } 

          lowrz = 1 - numRz;
@@ -217,120 +231,4 @@ __global__ void  kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
 // END EWALD RECIP SPACE
 //###########################################################################

-
-    while (pos < end)
-    {
-
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        float dx;
-        float dy;
-        float dz;
-        float r2;
-        float r;
-        float invR;
-        float sig;
-        float sig2;
-        float sig6;
-        float eps;
-        float dEdR;
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx;
-        Atom* psA = &sA[tbx];
-        unsigned int i      = x + tgx;
-        apos                = cSim.pPosq[i];
-        float2 a            = cSim.pAttr[i];
-        af.x                = 0.0f;
-        af.y                = 0.0f;
-        af.z                = 0.0f;
-        
-                int j                   = y + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-
-            sA[threadIdx.x].fx      = 0.0f;
-            sA[threadIdx.x].fy      = 0.0f;
-            sA[threadIdx.x].fz      = 0.0f;
-            apos.w                 *= cSim.epsfac;
-
-                // Read fixed atom data into registers and GRF
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x;
-                    dy              = psA[tj].y - apos.y;
-                    dz              = psA[tj].z - apos.z;
-
-                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    r               = sqrt(r2);
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[tj].sig;
-                    sig2            = invR * sig;
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2;
-                    eps             = a.y * psA[tj].eps;
-
-// ##### LJ
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-
-// ##### SHORT RANGE EWALD
-                    float alphaR    = alphaEwald * r;
-                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
-
-
-                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR = 0.0f;
-                    }
-
-
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    psA[tj].fx     += dx;
-                    psA[tj].fy     += dy;
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = (tj + 1) & (GRID - 1);
-                }
-
-            // Write results
-            float4 of;
-
-            of.x                                = af.x;
-            of.y                                = af.y;
-            of.z                                = af.z;
-            of.w                                = 0.0f;
-            int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = of;
-            of.x                                = sA[threadIdx.x].fx;
-            of.y                                = sA[threadIdx.x].fy;
-            of.z                                = sA[threadIdx.x].fz;
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = of;
-
-        pos++;
-    }
-*/
 }