Continuing to implement AmoebaMultipoleForce: various bug fixes

4cebdb5a · Peter Eastman · c0d1eb7b · 4cebdb5a · 4cebdb5a · 4cebdb5a
Commit 4cebdb5a authored Aug 27, 2012 by Peter Eastman
4 changed files
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -1154,14 +1154,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        pmeDefines["NUM_ATOMS"] = cu.intToString(numMultipoles);
        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(138.9354558456);
-//        pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
-//        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
        pmeDefines["M_PI"] = cu.doubleToString(M_PI);
-//        if (cu.getUseDoublePrecision())
-//            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
+        pmeDefines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
        pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
        pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
@@ -1401,7 +1398,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &inducedField->getDevicePointer(),
                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
-            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
            inducedDipoleErrors->download(errors);
            double total1 = 0.0, total2 = 0.0;
            for (int j = 0; j < (int) errors.size(); j++) {
@@ -1525,7 +1522,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &inducedField->getDevicePointer(),
                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
-            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
            inducedDipoleErrors->download(errors);
            double total1 = 0.0, total2 = 0.0;
            for (int j = 0; j < (int) errors.size(); j++) {

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleInducedField.cu
@@ -34,7 +34,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
        real ralpha = EWALD_ALPHA*r;
        real bn0 = erfc(ralpha)*rI;
        real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
-        real alsq2n = RECIP(SQRT(M_PI)*EWALD_ALPHA);
+        real alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
        real exp2a = expf(-(ralpha*ralpha));
        alsq2n *= alsq2;
        real bn1 = (bn0+alsq2n*exp2a)*rI*rI;
@@ -129,7 +129,7 @@ extern "C" __global__ void computeInducedField(
 #endif
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
 #ifndef ENABLE_SHUFFLE
-    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+//    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
 #endif
    
    do {
@@ -194,11 +194,12 @@ extern "C" __global__ void computeInducedField(
                localData[threadIdx.x].fieldPolar = make_real3(0);
 #ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (flags != 0xFFFFFFFF) {
+                if (flags == 0) { // TODO: Figure out what the flags != 0 case doesn't work!!!
+//                if (flags != 0xFFFFFFFF) {
                    if (flags == 0) {
                        // No interactions in this tile.
                    }
-                    else {
+/*                    else {
                        // Compute only a subset of the interactions in this tile.

                        for (int j = 0; j < TILE_SIZE; j++) {
@@ -213,6 +214,8 @@ extern "C" __global__ void computeInducedField(
                                real3 fields[4];
                                computeOneInteraction(data, localData[atom2], delta, fields);
                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                                    data.field += fields[0];
+                                    data.fieldPolar += fields[1];
 #ifdef ENABLE_SHUFFLE
                                    for (int i = 16; i >= 1; i /= 2) {
                                        fields[2].x += __shfl_xor(fields[2].x, i, 32);
@@ -258,7 +261,7 @@ extern "C" __global__ void computeInducedField(
                                }
                            }
                        }
-                    }
+                    }*/
                }
                else
 #endif
@@ -317,7 +320,11 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
        real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors) {
    extern __shared__ real2 buffer[];
    const float polarSOR = 0.55f;
-    const real term = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT(M_PI);
+#ifdef USE_EWALD
+    const real ewaldScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
+#else
+    const real ewaldScale = 0;
+#endif
    const real fieldScale = 1/(real) 0xFFFFFFFF;
    real sumErrors = 0;
    real sumPolarErrors = 0;
@@ -328,8 +335,8 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
            int fieldIndex = atom+component*PADDED_NUM_ATOMS;
            real previousDipole = inducedDipole[dipoleIndex];
            real previousDipolePolar = inducedDipolePolar[dipoleIndex];
-            real newDipole = scale*((fixedField[fieldIndex]+inducedField[fieldIndex])*fieldScale+term*previousDipole);
-            real newDipolePolar = scale*((fixedFieldPolar[fieldIndex]+inducedFieldPolar[fieldIndex])*fieldScale+term*previousDipolePolar);
+            real newDipole = scale*((fixedField[fieldIndex]+inducedField[fieldIndex])*fieldScale+ewaldScale*previousDipole);
+            real newDipolePolar = scale*((fixedFieldPolar[fieldIndex]+inducedFieldPolar[fieldIndex])*fieldScale+ewaldScale*previousDipolePolar);
            newDipole = previousDipole + polarSOR*(newDipole-previousDipole);
            newDipolePolar = previousDipolePolar + polarSOR*(newDipolePolar-previousDipolePolar);
            inducedDipole[dipoleIndex] = newDipole;

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
@@ -159,6 +159,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
            pmeAtomRange[j] = NUM_ATOMS;
    }
 }
+
 extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
        const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
        const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
@@ -468,7 +469,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
        phi[20*m+17] = tuv102;
        phi[20*m+18] = tuv012;
        phi[20*m+19] = tuv111;
-        real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT(M_PI);
+        real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
        long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-GRID_SIZE_X*invPeriodicBoxSize.x*tuv100)*0xFFFFFFFF);
        fieldBuffers[m] = fieldx;
        fieldPolarBuffers[m] = fieldx;

--- a/plugins/amoeba/platforms/cuda2/src/kernels/pmeElectrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/pmeElectrostaticPairForce.cu
@@ -952,7 +952,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
    real alsq2n = 0;
    if (EWALD_ALPHA > 0)
-        alsq2n = RECIP(SQRT(M_PI)*EWALD_ALPHA);
+        alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
    real exp2a = EXP(-(ralpha*ralpha));

    real rr1 = RECIP(r);
@@ -1012,7 +1012,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
 */
 __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    real term = 2*EWALD_ALPHA*EWALD_ALPHA;
-    real fterm = -EWALD_ALPHA/SQRT(M_PI);
+    real fterm = -EWALD_ALPHA/SQRT_PI;
    real cii = atom1.q*atom1.q;
    real dii = dot(atom1.dipole, atom1.dipole);
    real qii = 2*(atom1.quadrupoleXX*atom1.quadrupoleXX +
@@ -1030,5 +1030,5 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    // self-torque for PME

    real3 ui = atom1.inducedDipole+atom1.inducedDipolePolar;
-    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT(M_PI))*cross(atom1.dipole, ui);
+    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(atom1.dipole, ui);
 }
\ No newline at end of file