Beginning to implement PME reciprocal space calculation

fb62fecb · Peter Eastman · cf335495 · fb62fecb · fb62fecb · fb62fecb
Commit fb62fecb authored Sep 03, 2010 by Peter Eastman
5 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -29,6 +29,9 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
+#ifdef WIN32
+  #define _USE_MATH_DEFINES /* M_PI */
+#endif
 #include "openmm/OpenMMException.h"
 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
@@ -37,9 +40,11 @@
 extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
+#include <cmath>
 #include <sstream>
 #include <limits>
 #include <cstring>
+#include <vector>
 #ifdef WIN32
 //  #include <windows.h>
@@ -49,6 +54,8 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
 //#define AMOEBA_DEBUG
 #undef  AMOEBA_DEBUG
+using std::vector;
 extern "C"
 amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
 {
@@ -68,6 +75,12 @@ amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
    amoebaGpu->amoebaSim.numberOfAtoms         = gpu->natoms;
    amoebaGpu->amoebaSim.paddedNumberOfAtoms   = gpu->sim.paddedNumberOfAtoms;
+    amoebaGpu->psThetai1 = NULL;
+    amoebaGpu->psThetai2 = NULL;
+    amoebaGpu->psThetai3 = NULL;
+    amoebaGpu->psQfac = NULL;
+    amoebaGpu->psIgrid = NULL;
    return amoebaGpu;
 }
@@ -2176,6 +2189,118 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
 }
+extern "C"
+void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
+{
+    gpuContext gpu = amoebaGpu->gpuContext;
+    gpu->sim.alphaEwald = alpha;
+    int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
+    gpu->sim.pmeGridSize = gridSize;
+    int3 groupSize = make_int3(2, 4, 4);
+    gpu->sim.pmeGroupSize = groupSize;
+    const int3 numGroups = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
+    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
+    cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
+    gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
+    gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
+    gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
+    gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
+    gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
+    gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
+    gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
+    gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
+    amoebaGpu->psQfac = new CUDAStream<float>(gridSize.x*gridSize.y*gridSize.z, 1, "qfac");
+    amoebaGpu->amoebaSim.pQfac = amoebaGpu->psQfac->_pDevData;
+    amoebaGpu->psThetai1 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai1");
+    amoebaGpu->amoebaSim.pThetai1 = amoebaGpu->psThetai1->_pDevData;
+    amoebaGpu->psThetai2 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai2");
+    amoebaGpu->amoebaSim.pThetai2 = amoebaGpu->psThetai2->_pDevData;
+    amoebaGpu->psThetai3 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai3");
+    amoebaGpu->amoebaSim.pThetai3 = amoebaGpu->psThetai3->_pDevData;
+    amoebaGpu->psIgrid = new CUDAStream<int4>(gpu->natoms, 1, "igrid");
+    amoebaGpu->amoebaSim.pIgrid = amoebaGpu->psIgrid->_pDevData;
+    gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
+    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
+    gpu->psPmeBsplineTheta = new CUDAStream<float4>(1, 1, "PmeBsplineTheta"); // Not actually uesd
+    gpu->psPmeBsplineDtheta = new CUDAStream<float4>(1, 1, "PmeBsplineDtheta"); // Not actually used
+    // Initialize the b-spline moduli.
+    double array[AMOEBA_PME_ORDER];
+    double x = 0.0;
+    array[0] = 1.0 - x;
+    array[1] = x;
+    for (int k = 2; k < AMOEBA_PME_ORDER; k++) {
+        double denom = 1.0/k;
+        array[k] = x*array[k-1]*denom;
+        for (int i = 1; i < k; i++)
+            array[k-i] = ((x+i)*array[k-i-1] + ((k-i+1)-x)*array[k-i])*denom;
+        array[0] = (1.0-x)*array[0]*denom;
+    }
+    int maxSize = std::max(std::max(gridSize.x, gridSize.y), gridSize.z);
+    vector<double> bsarray(maxSize+1, 0.0);
+    for (int i = 2; i <= AMOEBA_PME_ORDER+1; i++)
+        bsarray[i] = array[i-2];
+    for (int dim = 0; dim < 3; dim++) {
+        float* bsmod = gpu->psPmeBsplineModuli[dim]->_pSysData;
+        int size =  gpu->psPmeBsplineModuli[dim]->_length;
+        // get the modulus of the discrete Fourier transform
+        double factor = 2.0*M_PI/size;
+        for (int i = 0; i < size; i++) {
+            double sum1 = 0.0;
+            double sum2 = 0.0;
+            for (int j = 1; j <= size; j++) {
+                double arg = factor*i*(j-1);
+                sum1 = sum1 + bsarray[j]*cos(arg);
+                sum2 = sum2 + bsarray[j]*sin(arg);
+            }
+            bsmod[i] = sum1*sum1 + sum2*sum2;
+        }
+        // fix for exponential Euler spline interpolation failure
+        double eps = 1.0e-7;
+        if (bsmod[0] < eps)
+            bsmod[0] = 0.5 * bsmod[1];
+        for (int i = 1; i < size-1; i++)
+            if (bsmod[i] < eps)
+                bsmod[i] = 0.5*(bsmod[i-1]+bsmod[i+1]);
+        if (bsmod[size-1] < eps)
+            bsmod[size-1] = 0.5*bsmod[size-2];
+        // compute and apply the optimal zeta coefficient
+        int jcut = 50;
+        for (int i = 1; i <= size; i++) {
+            int k = i - 1;
+            if (i > size/2)
+                k = k - size;
+            double zeta;
+            if (k == 0)
+                zeta = 1.0;
+            else {
+                double sum1 = 1.0;
+                double sum2 = 1.0;
+                factor = M_PI*k/size;
+                for (int j = 1; j <= jcut; j++) {
+                    double arg = factor/(factor+M_PI*j);
+                    sum1 = sum1 + pow(arg, AMOEBA_PME_ORDER);
+                    sum2 = sum2 + pow(arg, 2*AMOEBA_PME_ORDER);
+                }
+                for (int j = 1; j <= jcut; j++) {
+                    double arg = factor/(factor-M_PI*j);
+                    sum1 = sum1 + pow(arg, AMOEBA_PME_ORDER);
+                    sum2 = sum2 + pow(arg, 2*AMOEBA_PME_ORDER);
+                }
+                zeta = sum2/sum1;
+            }
+            bsmod[i-1] = bsmod[i-1]*zeta*zeta;
+        }
+    }
+}
 extern "C"
 void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu,  const std::vector< std::vector<int> >& exclusions )
 {
@@ -2573,6 +2698,14 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
    delete gpu->ps_P_ScaleIndices; 
    delete gpu->ps_M_ScaleIndices;
+    if (gpu->psThetai1) {
+        delete gpu->psThetai1;
+        delete gpu->psThetai2;
+        delete gpu->psThetai3;
+        delete gpu->psQfac;
+        delete gpu->psIgrid;
+    }
    if( gpu->pMapArray ){
        for( unsigned int ii = 0; ii < gpu->paddedNumberOfAtoms; ii++ ){
            delete gpu->pMapArray[ii];
@@ -2626,6 +2759,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
    SetCalculateAmoebaCudaFixedEAndGKFieldsSim( amoebaGpu );
    SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim( amoebaGpu );
    SetCalculateObcGbsaForces2Sim(  amoebaGpu->gpuContext  );
+    SetCalculateAmoebaPMESim( amoebaGpu );
 }
 extern "C"

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -156,5 +156,9 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
 //extern int isNanOrInfinity( double number );
 extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
+// PME
+extern void SetCalculateAmoebaPMESim( amoebaGpuContext amoebaGpu );
 #endif //__AMOEBA_GPU_TYPES_H__
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -47,6 +47,8 @@ enum CudaAmoebaNonbondedMethod
    AMOEBA_PARTICLE_MESH_EWALD
 };
+static const int AMOEBA_PME_ORDER = 5;
 struct cudaAmoebaGmxSimulation {
    // Constants
@@ -177,6 +179,13 @@ struct cudaAmoebaGmxSimulation {
    float fd;        // electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
    float fq;        // electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);
+    // PME arrays
+    float* pQfac;
+    float4* pThetai1;
+    float4* pThetai2;
+    float4* pThetai3;
+    int4* pIgrid;
 };
 #endif
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -219,6 +219,14 @@ struct _amoebaGpuContext {
    // Wca dispersion fields
    CUDAStream<float2>*  psWcaDispersionRadiusEpsilon;
+    // PME fields
+    CUDAStream<float>* psQfac;
+    CUDAStream<float4>* psThetai1;
+    CUDAStream<float4>* psThetai2;
+    CUDAStream<float4>* psThetai3;
+    CUDAStream<int4>* psIgrid;
 };
 typedef struct _amoebaGpuContext *amoebaGpuContext;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+//-----------------------------------------------------------------------------------------
+//-----------------------------------------------------------------------------------------
+#include "amoebaGpuTypes.h"
+#include "amoebaCudaKernels.h"
+//#define AMOEBA_DEBUG
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+void SetCalculateAmoebaPMESim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "SetCalculateAmoebaPMESim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
+    RTERROR(status, "SetCalculateAmoebaPMESim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+#define ARRAY(x,y) array[(x)-1+((y)-1)*AMOEBA_PME_ORDER]
+/**
+ * This is called from computeBsplines().  It calculates the spline coefficients for a single atom along a single axis.
+ */
+__device__ void computeBSplinePoint(float4* thetai, float w, float* array)
+{
+//      real*8 thetai(4,AMOEBA_PME_ORDER)
+//      real*8 array(AMOEBA_PME_ORDER,AMOEBA_PME_ORDER)
+    // initialization to get to 2nd order recursion
+    ARRAY(2,2) = w;
+    ARRAY(2,1) = 1.0f - w;
+    // perform one pass to get to 3rd order recursion
+    ARRAY(3,3) = 0.5f * w * ARRAY(2,2);
+    ARRAY(3,2) = 0.5f * ((1.0f+w)*ARRAY(2,1)+(2.0f-w)*ARRAY(2,2));
+    ARRAY(3,1) = 0.5f * (1.0f-w) * ARRAY(2,1);
+    // compute standard B-spline recursion to desired order
+    for (int i = 4; i <= AMOEBA_PME_ORDER; i++)
+    {
+        int k = i - 1;
+        float denom = 1.0f / k;
+        ARRAY(i,i) = denom * w * ARRAY(k,k);
+        for (int j = 1; j <= i-2; j++)
+            ARRAY(i,i-j) = denom * ((w+j)*ARRAY(k,i-j-1)+(i-j-w)*ARRAY(k,i-j));
+        ARRAY(i,1) = denom * (1.0f-w) * ARRAY(k,1);
+    }
+    // get coefficients for the B-spline first derivative
+    int k = AMOEBA_PME_ORDER - 1;
+    ARRAY(k,AMOEBA_PME_ORDER) = ARRAY(k,AMOEBA_PME_ORDER-1);
+    for (int i = AMOEBA_PME_ORDER-1; i >= 2; i--)
+        ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
+    ARRAY(k,1) = -ARRAY(k,1);
+    // get coefficients for the B-spline second derivative
+    k = AMOEBA_PME_ORDER - 2;
+    ARRAY(k,AMOEBA_PME_ORDER-1) = ARRAY(k,AMOEBA_PME_ORDER-2);
+    for (int i = AMOEBA_PME_ORDER-2; i >= 2; i--)
+        ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
+    ARRAY(k,1) = -ARRAY(k,1);
+    ARRAY(k,AMOEBA_PME_ORDER) = ARRAY(k,AMOEBA_PME_ORDER-1);
+    for (int i = AMOEBA_PME_ORDER-1; i >= 2; i--)
+        ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
+    ARRAY(k,1) = -ARRAY(k,1);
+    // get coefficients for the B-spline third derivative
+    k = AMOEBA_PME_ORDER - 3;
+    ARRAY(k,AMOEBA_PME_ORDER-2) = ARRAY(k,AMOEBA_PME_ORDER-3);
+    for (int i = AMOEBA_PME_ORDER-3; i >= 2; i--)
+        ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
+    ARRAY(k,1) = -ARRAY(k,1);
+    ARRAY(k,AMOEBA_PME_ORDER-1) = ARRAY(k,AMOEBA_PME_ORDER-2);
+    for (int i = AMOEBA_PME_ORDER-2; i >= 2; i--)
+        ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
+    ARRAY(k,1) = -ARRAY(k,1);
+    ARRAY(k,AMOEBA_PME_ORDER) = ARRAY(k,AMOEBA_PME_ORDER-1);
+    for (int i = AMOEBA_PME_ORDER-1; i >= 2; i--)
+        ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
+    ARRAY(k,1) = -ARRAY(k,1);
+    // copy coefficients from temporary to permanent storage
+    for (int i = 1; i <= AMOEBA_PME_ORDER; i++)
+        thetai[i] = make_float4(ARRAY(AMOEBA_PME_ORDER,i), ARRAY(AMOEBA_PME_ORDER-1,i), ARRAY(AMOEBA_PME_ORDER-2,i), ARRAY(AMOEBA_PME_ORDER-3,i));
+}
+/**
+ * Compute bspline coefficients.
+ */
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(256, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(128, 1)
+#endif
+void computeBsplines_kernel()
+{
+    extern __shared__ float bsplines_cache[]; // size = block_size*pme_order*pme_order
+    float* array = &bsplines_cache[threadIdx.x*AMOEBA_PME_ORDER*AMOEBA_PME_ORDER];
+    //  get the B-spline coefficients for each multipole site
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < cSim.atoms; i += blockDim.x*gridDim.x) {
+        float4 posq = cSim.pPosq[i];
+        posq.x -= floor(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
+        posq.y -= floor(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
+        posq.z -= floor(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
+        float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
+                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
+                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
+        // First axis.
+        float w = posq.x*cSim.invPeriodicBoxSizeX;
+        float fr = cSim.pmeGridSize.x*(w-(int)(w+0.5f)+0.5f);
+        int ifr = (int) fr;
+        w = fr - ifr;
+        int igrid1 = ifr - AMOEBA_PME_ORDER;
+        computeBSplinePoint(&cAmoebaSim.pThetai1[i*AMOEBA_PME_ORDER], w, array);
+        // Second axis.
+        w = posq.y*cSim.invPeriodicBoxSizeY;
+        fr = cSim.pmeGridSize.y*(w-(int)(w+0.5f)+0.5f);
+        ifr = (int) fr;
+        w = fr - ifr;
+        int igrid2 = ifr - AMOEBA_PME_ORDER;
+        computeBSplinePoint(&cAmoebaSim.pThetai2[i*AMOEBA_PME_ORDER], w, array);
+        // Third axis.
+        w = posq.z*cSim.invPeriodicBoxSizeZ;
+        fr = cSim.pmeGridSize.z*(w-(int)(w+0.5f)+0.5f);
+        ifr = (int) fr;
+        w = fr - ifr;
+        int igrid3 = ifr - AMOEBA_PME_ORDER;
+        computeBSplinePoint(&cAmoebaSim.pThetai3[i*AMOEBA_PME_ORDER], w, array);
+        cAmoebaSim.pIgrid[i] = make_int4(igrid1, igrid2, igrid3, 0);
+        cSim.pPmeAtomGridIndex[i] = make_int2(i, igrid1*cSim.pmeGridSize.y*cSim.pmeGridSize.z+igrid2*cSim.pmeGridSize.z+igrid3);
+    }
+}
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kGridSpreadMultipoles_kernel()
+{
+    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
+    unsigned int numThreads = gridDim.x*blockDim.x;
+    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
+    {
+        int3 gridPoint;
+        gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
+        int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
+        gridPoint.y = remainder/cSim.pmeGridSize.z;
+        gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
+        float result = 0.0f;
+        for (int ix = 0; ix < PME_ORDER; ++ix)
+        {
+            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
+            for (int iy = 0; iy < PME_ORDER; ++iy)
+            {
+                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
+                int z1 = gridPoint.z-PME_ORDER+1;
+                z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
+                int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
+                int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
+                int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
+                int firstAtom = cSim.pPmeAtomRange[gridIndex1];
+                int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
+                for (int i = firstAtom; i < lastAtom; ++i)
+                {
+                    int2 atomData = cSim.pPmeAtomGridIndex[i];
+                    int atomIndex = atomData.x;
+                    int z = atomData.y;
+                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                    float atomCharge = cSim.pPosq[atomIndex].w;
+                    float atomDipoleX = cAmoebaSim.pLabFrameDipole[atomIndex+3];
+                    float atomDipoleY = cAmoebaSim.pLabFrameDipole[atomIndex+3+1];
+                    float atomDipoleZ = cAmoebaSim.pLabFrameDipole[atomIndex+3+2];
+                    float atomQuadrupoleXX = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9];
+                    float atomQuadrupoleXY = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+1];
+                    float atomQuadrupoleXZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+2];
+                    float atomQuadrupoleYY = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+4];
+                    float atomQuadrupoleYZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+5];
+                    float atomQuadrupoleZZ = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+8];
+                    float4 t = cAmoebaSim.pThetai1[atomIndex+ix*cSim.atoms];
+                    float4 u = cAmoebaSim.pThetai2[atomIndex+iy*cSim.atoms];
+                    float4 v = cAmoebaSim.pThetai3[atomIndex+iz*cSim.atoms];
+                    float term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
+                    float term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
+                    float term2 = atomQuadrupoleXX * u.x * v.x;
+                    result += term0*t.x + term1*t.y + term2*t.z;
+                }
+                if (z1 > gridPoint.z)
+                {
+                    gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
+                    gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
+                    firstAtom = cSim.pPmeAtomRange[gridIndex1];
+                    lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
+                    for (int i = firstAtom; i < lastAtom; ++i)
+                    {
+                        int2 atomData = cSim.pPmeAtomGridIndex[i];
+                        int atomIndex = atomData.x;
+                        int z = atomData.y;
+                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                        float atomCharge = cSim.pPosq[atomIndex].w;
+                        float atomDipoleX = cAmoebaSim.pLabFrameDipole[atomIndex+3];
+                        float atomDipoleY = cAmoebaSim.pLabFrameDipole[atomIndex+3+1];
+                        float atomDipoleZ = cAmoebaSim.pLabFrameDipole[atomIndex+3+2];
+                        float atomQuadrupoleXX = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9];
+                        float atomQuadrupoleXY = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+1];
+                        float atomQuadrupoleXZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+2];
+                        float atomQuadrupoleYY = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+4];
+                        float atomQuadrupoleYZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+5];
+                        float atomQuadrupoleZZ = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+8];
+                        float4 t = cAmoebaSim.pThetai1[atomIndex+ix*cSim.atoms];
+                        float4 u = cAmoebaSim.pThetai2[atomIndex+iy*cSim.atoms];
+                        float4 v = cAmoebaSim.pThetai3[atomIndex+iz*cSim.atoms];
+                        float term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
+                        float term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
+                        float term2 = atomQuadrupoleXX * u.x * v.x;
+                        result += term0*t.x + term1*t.y + term2*t.z;
+                    }
+                }
+            }
+        }
+        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
+    }
+}