Simplification and cleanup of the spherical harmonics code

5c090de2 · peastman · 474f600e
Commit 5c090de2 authored Aug 20, 2015 by peastman
3 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1158,20 +1158,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
    }
    stringstream electrostaticsSource;
-    if (usePME) {
-        electrostaticsSource << CudaKernelSources::vectorOps;
-        electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
+    electrostaticsSource << CudaKernelSources::vectorOps;
+    electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
+    if (usePME)
        electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
-        electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-    }
-    else {
-        electrostaticsSource << CudaKernelSources::vectorOps;
-        electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
+    else
        electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
-        electrostaticsThreadMemory = 24*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-        if (gk != NULL)
-            electrostaticsThreadMemory += 4*elementSize;
-    }
+    electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
    electrostaticsThreads = min(maxThreads, cu.computeThreadBlockSize(electrostaticsThreadMemory));
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(electrostaticsThreads);
    module = cu.createModule(electrostaticsSource.str(), defines);
@@ -1492,7 +1485,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
            &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
+            &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        if (gkKernel != NULL)
@@ -1647,7 +1640,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
+            &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        void* pmeTransformInducedPotentialArgs[] = {&pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
@@ -382,8 +382,7 @@ extern "C" __global__ void computeElectrostatics(
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const unsigned int* __restrict__ interactingAtoms,
 #endif
-        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ sphericalDipole,
-        const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole,
+        const real* __restrict__ sphericalDipole, const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;

--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
@@ -411,32 +411,23 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
 /**
 * Compute the self energy and self torque.
 */
-__device__ void computeSelfEnergyAndTorque(AtomData& atom1, int atomIndex, real& energy, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole) {
-    real term = 2*EWALD_ALPHA*EWALD_ALPHA;
-    real fterm = -EWALD_ALPHA/SQRT_PI;
+__device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    real cii = atom1.q*atom1.q;
-    real3 dipole = make_real3(labFrameDipole[atomIndex*3], labFrameDipole[atomIndex*3+1], labFrameDipole[atomIndex*3+2]);
-    real dii = dot(dipole, dipole);
+    real3 dipole = make_real3(atom1.sphericalDipole.y, atom1.sphericalDipole.z, atom1.sphericalDipole.x);
+    real dii = dot(dipole, dipole+atom1.inducedDipole);
 #ifdef INCLUDE_QUADRUPOLES
-    real quadrupoleXX = labFrameQuadrupole[atomIndex*5];
-    real quadrupoleXY = labFrameQuadrupole[atomIndex*5+1];
-    real quadrupoleXZ = labFrameQuadrupole[atomIndex*5+2];
-    real quadrupoleYY = labFrameQuadrupole[atomIndex*5+3];
-    real quadrupoleYZ = labFrameQuadrupole[atomIndex*5+4];
-    real qii = 2*(quadrupoleXX*quadrupoleXX +
-                  quadrupoleYY*quadrupoleYY +
-                  quadrupoleXX*quadrupoleYY +
-                  quadrupoleXY*quadrupoleXY +
-                  quadrupoleXZ*quadrupoleXZ +
-                  quadrupoleYZ*quadrupoleYZ);
+    real qii = (atom1.sphericalQuadrupole[0]*atom1.sphericalQuadrupole[0] +
+                atom1.sphericalQuadrupole[1]*atom1.sphericalQuadrupole[1] +
+                atom1.sphericalQuadrupole[2]*atom1.sphericalQuadrupole[2] +
+                atom1.sphericalQuadrupole[3]*atom1.sphericalQuadrupole[3] +
+                atom1.sphericalQuadrupole[4]*atom1.sphericalQuadrupole[4]);
 #else
    real qii = 0;
 #endif
-    real uii = dot(dipole, atom1.inducedDipole);
-    real selfEnergy = (cii + term*(dii/3 + 2*term*qii/5));
-    selfEnergy += term*uii/3;
-    selfEnergy *= fterm;
-    energy += selfEnergy;
+    real prefac = -EWALD_ALPHA/SQRT_PI;
+    real a2 = EWALD_ALPHA*EWALD_ALPHA;
+    real a4 = a2*a2;
+    energy += prefac*(cii + ((real)2/3)*a2*dii + ((real) 4/15)*a4*qii);

    // self-torque for PME

@@ -456,9 +447,8 @@ extern "C" __global__ void computeElectrostatics(
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const unsigned int* __restrict__ interactingAtoms,
 #endif
-        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ sphericalDipole,
-        const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
-        const float2* __restrict__ dampingAndThole) {
+        const real* __restrict__ sphericalDipole, const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole,
+        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
@@ -511,7 +501,7 @@ extern "C" __global__ void computeElectrostatics(
                }
            }
            if (atom1 < NUM_ATOMS)
-                computeSelfEnergyAndTorque(data, atom1, energy, labFrameDipole, labFrameQuadrupole);
+                computeSelfEnergyAndTorque(data, energy);
            data.force *= -ENERGY_SCALE_FACTOR;
            data.torque *= ENERGY_SCALE_FACTOR;
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));