Added tests for calculating system multipole moments and multipole potential on a grid

Refactored code for calculating system multipole moments and multipole potential on a grid to separate files (removed from kCalculateAmoebaCudaRotateFrame.cu and kCalculateAmoebaCudaElectrostatic.cu)

Added tests for calculating system multipole moments and multipole potential on a grid
Refactored code for calculating system multipole moments and multipole potential on a grid to separate files (removed from kCalculateAmoebaCudaRotateFrame.cu and kCalculateAmoebaCudaElectrostatic.cu)
155fe172 · Mark Friedrichs · 776f6f22 · 155fe172 · 155fe172 · 155fe172
Commit 155fe172 authored Jul 31, 2012 by Mark Friedrichs
9 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -860,14 +860,13 @@ static void computeAmoebaMultipolePotential( AmoebaCudaData& data, const std::ve
    }
 }

-static void computeAmoebaSystemMultipoleMoments( AmoebaCudaData& data, const Vec3& origin,
-                                                 std::vector< double >& outputMultipoleMonents) {
+static void computeAmoebaSystemMultipoleMoments( AmoebaCudaData& data, std::vector< double >& outputMultipoleMonents) {

    amoebaGpuContext gpu = data.getAmoebaGpu();

    data.setGpuInitialized( false );
    data.initializeGpu();
-    kCalculateAmoebaSystemMultipoleMoments( gpu, origin, outputMultipoleMonents );
+    kCalculateAmoebaSystemMultipoleMoments( gpu, outputMultipoleMonents );

 }

@@ -1087,7 +1086,7 @@ void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl&

 void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context,  const Vec3& origin,
                                                                   std::vector< double >& outputMultipoleMonents) {
-    computeAmoebaSystemMultipoleMoments( data, origin, outputMultipoleMonents);
+    computeAmoebaSystemMultipoleMoments( data, outputMultipoleMonents);
    return;
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -3332,6 +3332,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu, int updateFlag )
    SetCalculateAmoebaLocalForcesSim( amoebaGpu );
    SetCalculateAmoebaCudaUtilitiesSim( amoebaGpu );
    SetCalculateAmoebaMultipoleForcesSim( amoebaGpu );
+    SetCalculateAmoebaMultipolePotentialSim( amoebaGpu );
    SetCalculateAmoebaCudaFixedEFieldSim( amoebaGpu );
    SetCalculateAmoebaCudaVdw14_7Sim( amoebaGpu );
    SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -48,13 +48,22 @@ extern void SetCalculateAmoebaLocalForcesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaLocalForcesSim(amoebaGpuContext gpu);
 extern void kCalculateAmoebaLocalForces(amoebaGpuContext gpu);

-// multipole
+// multipole forces

 extern void SetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext gpu);
 extern void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool performGk );
+extern void kSetupAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood );
+
+// multipole potential
+
+extern void SetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext gpu);
+extern void GetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext gpu);
 extern void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu );
-extern void kCalculateAmoebaSystemMultipoleMoments(amoebaGpuContext amoebaGpu, const OpenMM::Vec3& origin, std::vector< double >& outputMultipoleMonents );
+
+// system multipole moments
+
+extern void kCalculateAmoebaSystemMultipoleMoments(amoebaGpuContext amoebaGpu, std::vector< double >& outputMultipoleMonents );

 // vdw


--- a/plugins/amoeba/platforms/cuda/src/kernels/calculateSystemMultipoleMoments.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/calculateSystemMultipoleMoments.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaCudaKernels.h"
+#include "openmm/OpenMMException.h"
+
+#include <stdio.h>
+using namespace std; 
+
+void kCalculateAmoebaSystemMultipoleMoments( amoebaGpuContext amoebaGpu, std::vector< double >& outputMultipoleMoments ) 
+{
+
+    // setup
+
+    kSetupAmoebaMultipoleForces(amoebaGpu, false ); 
+
+    gpuContext gpu         = amoebaGpu->gpuContext;
+
+    gpu->psPosq4->Download();
+    gpu->psVelm4->Download();
+    float4* posq           = gpu->psPosq4->_pSysData;    
+    float4* velm           = gpu->psVelm4->_pSysData;    
+    float totalMass        = 0.0f;
+    float centerOfMass[3]  = { 0.0f, 0.0f, 0.0f };
+    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
+        float mass;
+        if( velm->w > 0.0f ){
+            mass        = 1.0f/velm[ii].w;
+        } else {
+            mass        = 0.0f;
+        }
+        totalMass        += mass;
+        centerOfMass[0]  += mass*posq[ii].x;
+        centerOfMass[1]  += mass*posq[ii].y;
+        centerOfMass[2]  += mass*posq[ii].z;
+    }
+
+    std::vector<float4>  posqLocal(gpu->natoms);
+    if( totalMass > 0.0f ){
+        centerOfMass[0]  /= totalMass;
+        centerOfMass[1]  /= totalMass;
+        centerOfMass[2]  /= totalMass;
+    }
+    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
+        posqLocal[ii].x = posq[ii].x  - centerOfMass[0];
+        posqLocal[ii].y = posq[ii].y  - centerOfMass[1];
+        posqLocal[ii].z = posq[ii].z  - centerOfMass[2];
+        posqLocal[ii].w = posq[ii].w;
+    }
+
+    float netchg  = 0.0f;
+
+    float xdpl    = 0.0f;
+    float ydpl    = 0.0f;
+    float zdpl    = 0.0f;
+
+    float xxqdp   = 0.0f;
+    float xyqdp   = 0.0f;
+    float xzqdp   = 0.0f;
+
+    float yxqdp   = 0.0f;
+    float yyqdp   = 0.0f;
+    float yzqdp   = 0.0f;
+
+    float zxqdp   = 0.0f;
+    float zyqdp   = 0.0f;
+    float zzqdp   = 0.0f;
+
+    amoebaGpu->psLabFrameDipole->Download();
+    float* labFrameDipole      = amoebaGpu->psLabFrameDipole->_pSysData;    
+
+    amoebaGpu->psInducedDipole->Download();
+    float* inducedDipole       = amoebaGpu->psInducedDipole->_pSysData;    
+
+    amoebaGpu->psLabFrameQuadrupole->Download();
+    float* labFrameQuadrupole  = amoebaGpu->psLabFrameQuadrupole->_pSysData;    
+    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
+
+        netchg              += posqLocal[ii].w;
+
+        float netDipoleX     = (labFrameDipole[3*ii]    + inducedDipole[3*ii]);
+        float netDipoleY     = (labFrameDipole[3*ii+1]  + inducedDipole[3*ii+1]);
+        float netDipoleZ     = (labFrameDipole[3*ii+2]  + inducedDipole[3*ii+2]);
+
+        xdpl    += posqLocal[ii].x*posqLocal[ii].w + netDipoleX;
+        ydpl    += posqLocal[ii].y*posqLocal[ii].w + netDipoleY;
+        zdpl    += posqLocal[ii].z*posqLocal[ii].w + netDipoleZ;
+
+        xxqdp   += posqLocal[ii].x*posqLocal[ii].x*posqLocal[ii].w + 2.0f*posqLocal[ii].x*netDipoleX;
+        xyqdp   += posqLocal[ii].x*posqLocal[ii].y*posqLocal[ii].w + posqLocal[ii].x*netDipoleY + posqLocal[ii].y*netDipoleX;
+        xzqdp   += posqLocal[ii].x*posqLocal[ii].z*posqLocal[ii].w + posqLocal[ii].x*netDipoleZ + posqLocal[ii].z*netDipoleX;
+
+        yxqdp   += posqLocal[ii].y*posqLocal[ii].x*posqLocal[ii].w + posqLocal[ii].y*netDipoleX + posqLocal[ii].x*netDipoleY;
+        yyqdp   += posqLocal[ii].y*posqLocal[ii].y*posqLocal[ii].w + 2.0f*posqLocal[ii].y*netDipoleY;
+        yzqdp   += posqLocal[ii].y*posqLocal[ii].z*posqLocal[ii].w + posqLocal[ii].y*netDipoleZ + posqLocal[ii].z*netDipoleY;
+
+        zxqdp   += posqLocal[ii].z*posqLocal[ii].x*posqLocal[ii].w + posqLocal[ii].z*netDipoleX + posqLocal[ii].x*netDipoleZ;
+        zyqdp   += posqLocal[ii].z*posqLocal[ii].y*posqLocal[ii].w + posqLocal[ii].z*netDipoleY + posqLocal[ii].y*netDipoleZ;
+        zzqdp   += posqLocal[ii].z*posqLocal[ii].z*posqLocal[ii].w + 2.0f*posqLocal[ii].z*netDipoleZ;
+
+    }
+
+//  convert the quadrupole from traced to traceless form
+ 
+    float qave   = (xxqdp + yyqdp + zzqdp)/3.0f;
+          xxqdp  = 1.5f*(xxqdp-qave);
+          xyqdp  = 1.5f*xyqdp;
+          xzqdp  = 1.5f*xzqdp;
+          yxqdp  = 1.5f*yxqdp;
+          yyqdp  = 1.5f*(yyqdp-qave);
+          yzqdp  = 1.5f*yzqdp;
+          zxqdp  = 1.5f*zxqdp;
+          zyqdp  = 1.5f*zyqdp;
+          zzqdp  = 1.5f*(zzqdp-qave);
+
+//  add the traceless atomic quadrupoles to total quadrupole
+
+    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
+        xxqdp  = xxqdp + 3.0f*labFrameQuadrupole[9*ii];
+        xyqdp  = xyqdp + 3.0f*labFrameQuadrupole[9*ii+1];
+        xzqdp  = xzqdp + 3.0f*labFrameQuadrupole[9*ii+2];
+        yxqdp  = yxqdp + 3.0f*labFrameQuadrupole[9*ii+3];
+        yyqdp  = yyqdp + 3.0f*labFrameQuadrupole[9*ii+4];
+        yzqdp  = yzqdp + 3.0f*labFrameQuadrupole[9*ii+5];
+        zxqdp  = zxqdp + 3.0f*labFrameQuadrupole[9*ii+6];
+        zyqdp  = zyqdp + 3.0f*labFrameQuadrupole[9*ii+7];
+        zzqdp  = zzqdp + 3.0f*labFrameQuadrupole[9*ii+8];
+    }
+ 
+    float debye                = 4.80321f;
+    outputMultipoleMoments.resize( 13 );
+
+    outputMultipoleMoments[0]  = netchg;
+
+    outputMultipoleMoments[1]  = xdpl*debye;
+    outputMultipoleMoments[2]  = ydpl*debye;
+    outputMultipoleMoments[3]  = zdpl*debye;
+    
+    outputMultipoleMoments[4]  = xxqdp*debye;
+    outputMultipoleMoments[5]  = xyqdp*debye;
+    outputMultipoleMoments[6]  = xzqdp*debye;
+
+    outputMultipoleMoments[7]  = yxqdp*debye;
+    outputMultipoleMoments[8]  = yyqdp*debye;
+    outputMultipoleMoments[9]  = yzqdp*debye;
+
+    outputMultipoleMoments[10] = zxqdp*debye;
+    outputMultipoleMoments[11] = zyqdp*debye;
+    outputMultipoleMoments[12] = zzqdp*debye;
+
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -669,230 +669,3 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo

   // ---------------------------------------------------------------------------------------
 }
-
-struct ElectrostaticPotentialParticle {
-
-    // coordinates charge
-
-    float x;
-    float y;
-    float z;
-    float q;
-
-    // lab frame dipole
-
-    float labFrameDipole[3];
-
-    // lab frame quadrupole
-
-    float labFrameQuadrupole[9];
-
-    // induced dipole
-
-    float inducedDipole[3];
-
-};
-
-/**---------------------------------------------------------------------------------------
-
-   Load data for particle w/ index=atomI
-
-   @param sa        address to store atomI's coordinates and multipole moments
-   @param atomI     index of atom whose data is to be stored
-
-   --------------------------------------------------------------------------------------- */
-
-static __device__ void loadElectrostaticPotentialParticle( volatile struct ElectrostaticPotentialParticle* sA, unsigned int atomI ){
-
-    // coordinates & charge
-
-    sA->x                        = cSim.pPosq[atomI].x;
-    sA->y                        = cSim.pPosq[atomI].y;
-    sA->z                        = cSim.pPosq[atomI].z;
-    sA->q                        = cSim.pPosq[atomI].w;
-
-    // lab dipole
-
-    sA->labFrameDipole[0]        = cAmoebaSim.pLabFrameDipole[atomI*3];
-    sA->labFrameDipole[1]        = cAmoebaSim.pLabFrameDipole[atomI*3+1];
-    sA->labFrameDipole[2]        = cAmoebaSim.pLabFrameDipole[atomI*3+2];
-
-    // lab quadrupole
-
-    sA->labFrameQuadrupole[0]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
-    sA->labFrameQuadrupole[1]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
-    sA->labFrameQuadrupole[2]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
-    sA->labFrameQuadrupole[3]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
-    sA->labFrameQuadrupole[4]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
-    sA->labFrameQuadrupole[5]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
-    sA->labFrameQuadrupole[6]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
-    sA->labFrameQuadrupole[7]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
-    sA->labFrameQuadrupole[8]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
-
-    // induced dipole
-
-    sA->inducedDipole[0]         = cAmoebaSim.pInducedDipole[atomI*3];
-    sA->inducedDipole[1]         = cAmoebaSim.pInducedDipole[atomI*3+1];
-    sA->inducedDipole[2]         = cAmoebaSim.pInducedDipole[atomI*3+2];
-
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Calculate potential at grid point due atomI
-   Code adapted from TINKER routine potpoint in potpoint.f
-
-   @param atomI     atomI's coordinates and multipole moments
-   @param gridPoint grid coordinates
-   @param potential output potential
-
-   --------------------------------------------------------------------------------------- */
-
-__device__ void calculateElectrostaticPotentialForAtomGridPoint_kernel( volatile ElectrostaticPotentialParticle& atomI, volatile float4& gridPoint, float* potential ){
-  
-    float xr                 = atomI.x - gridPoint.x;
-    float yr                 = atomI.y - gridPoint.y;
-    float zr                 = atomI.z - gridPoint.z;
-   
-    float r2                 = xr*xr + yr*yr + zr*zr;
-    float r                  = sqrtf( r2 );
-
-    float rr1                = 1.0f/r;
-    *potential               = atomI.q*rr1;
-    float rr2                = rr1*rr1;
-    float rr3                = rr1*rr2;
-
-    float scd                = atomI.labFrameDipole[0]*xr     +  atomI.labFrameDipole[1]*yr    + atomI.labFrameDipole[2]*zr;
-    float scu                =  atomI.inducedDipole[0]*xr     +   atomI.inducedDipole[1]*yr    +  atomI.inducedDipole[2]*zr;
-    *potential              -= (scd + scu)*rr3;
-
-    float rr5                = 3.0f*rr3*rr2;
-    float scq                = xr*(atomI.labFrameQuadrupole[0]*xr + atomI.labFrameQuadrupole[1]*yr + atomI.labFrameQuadrupole[2]*zr);
-          scq               += yr*(atomI.labFrameQuadrupole[1]*xr + atomI.labFrameQuadrupole[4]*yr + atomI.labFrameQuadrupole[5]*zr);
-          scq               += zr*(atomI.labFrameQuadrupole[2]*xr + atomI.labFrameQuadrupole[5]*yr + atomI.labFrameQuadrupole[8]*zr);
-    *potential              += scq*rr5;
-
-    return;
-
-}
-
-// Include versions of the kernels for N x PotentialGridSize calculations.
-
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME 
-#define METHOD_NAME(a, b) a##NxG##b
-#include "kCalculateAmoebaCudaElectrostaticPotential.h"
-
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##NxGByWarp##b
-#include "kCalculateAmoebaCudaElectrostaticPotential.h"
-
-// Kernel to reduce potential
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReducePotential_kernel()
-{
-    unsigned int pos             = (blockIdx.x * blockDim.x + threadIdx.x);
-    float conversionFactor       = (cAmoebaSim.electric/cAmoebaSim.dielec);
-   
-    // Reduce potential
-    while (pos < cAmoebaSim.paddedPotentialGridSize)
-    {
-        float totalPotential         = 0.0f;
-        float* pFt                   = cAmoebaSim.pPotential + pos;
-        int i                        = cSim.outputBuffers;
-        while (i >= 4)
-        {
-            float f1             = *pFt;
-            pFt                 += cAmoebaSim.paddedPotentialGridSize;
-            float f2             = *pFt;
-            pFt                 += cAmoebaSim.paddedPotentialGridSize;
-            float f3             = *pFt;
-            pFt                 += cAmoebaSim.paddedPotentialGridSize;
-            float f4             = *pFt;
-            pFt                 += cAmoebaSim.paddedPotentialGridSize;
-            totalPotential      += f1 + f2 + f3 + f4;
-            i                   -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1             = *pFt;
-            pFt                 += cAmoebaSim.paddedPotentialGridSize;
-            float f2             = *pFt;
-            pFt                 += cAmoebaSim.paddedPotentialGridSize;
-            totalPotential      += f1 + f2;
-            i                   -= 2;
-        }
-        if (i > 0)
-        {
-            totalPotential += *pFt;
-        }
-        totalPotential *= conversionFactor;
-        pFt             = cAmoebaSim.pPotential + pos;
-        *pFt            = totalPotential;
-        pos            += gridDim.x*blockDim.x;
-    }   
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Reduce Amoeba electrostatic potential
-
-   @param gpu        gpu context
-
-   --------------------------------------------------------------------------------------- */
-
-void kReducePotential(gpuContext gpu)
-{
-    kReducePotential_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
-    LAUNCHERROR("kReducePotential");
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Compute Amoeba electrostatic potential
-
-   @param amoebaGpu        amoebaGpu context
-
-   --------------------------------------------------------------------------------------- */
-
-void cudaComputeAmoebaElectrostaticPotential( amoebaGpuContext amoebaGpu ){
-  
-   // ---------------------------------------------------------------------------------------
-
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    // on first pass, set threads/block
-
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        unsigned int maxThreads;
-        if (gpu->sm_version >= SM_20)
-            //maxThreads = 384;
-            maxThreads = 512;
-        else if (gpu->sm_version >= SM_12)
-            maxThreads = 128;
-        else
-            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticPotentialParticle), gpu->sharedMemoryPerBlock), maxThreads);
-    }
-
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaCudaElectrostaticPotentialNxGByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticPotentialParticle)*threadsPerBlock>>>( );
-    } else {
-        kCalculateAmoebaCudaElectrostaticPotentialNxG_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticPotentialParticle)*threadsPerBlock>>>( );
-    }
-    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticPotential");
-
-    kReducePotential( amoebaGpu->gpuContext );
-
-   // ---------------------------------------------------------------------------------------
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "cudaKernels.h"
+#include "amoebaCudaKernels.h"
+#include "kCalculateAmoebaCudaUtilities.h"
+#include "openmm/OpenMMException.h"
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cstdlib>
+using namespace std; 
+
+#define SQRT sqrtf
+
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
+
+void SetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "SetCalculateAmoebaMultipolePotentialSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));     
+    RTERROR(status, "SetCalculateAmoebaMultipolePotentialSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+
+void GetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "GetCalculateAmoebaMultipolePotentialSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));     
+    RTERROR(status, "GetCalculateAmoebaMultipolePotentialSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+}
+
+struct ElectrostaticPotentialParticle {
+
+    // coordinates charge
+
+    float x;
+    float y;
+    float z;
+    float q;
+
+    // lab frame dipole
+
+    float labFrameDipole[3];
+
+    // lab frame quadrupole
+
+    float labFrameQuadrupole[9];
+
+    // induced dipole
+
+    float inducedDipole[3];
+
+};
+
+/**---------------------------------------------------------------------------------------
+
+   Load data for particle w/ index=atomI
+
+   @param sa        address to store atomI's coordinates and multipole moments
+   @param atomI     index of atom whose data is to be stored
+
+   --------------------------------------------------------------------------------------- */
+
+static __device__ void loadElectrostaticPotentialParticle( volatile struct ElectrostaticPotentialParticle* sA, unsigned int atomI ){
+
+    // coordinates & charge
+
+    sA->x                        = cSim.pPosq[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
+    sA->q                        = cSim.pPosq[atomI].w;
+
+    // lab dipole
+
+    sA->labFrameDipole[0]        = cAmoebaSim.pLabFrameDipole[atomI*3];
+    sA->labFrameDipole[1]        = cAmoebaSim.pLabFrameDipole[atomI*3+1];
+    sA->labFrameDipole[2]        = cAmoebaSim.pLabFrameDipole[atomI*3+2];
+
+    // lab quadrupole
+
+    sA->labFrameQuadrupole[0]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
+    sA->labFrameQuadrupole[1]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole[2]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole[3]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
+    sA->labFrameQuadrupole[4]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole[5]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole[6]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
+    sA->labFrameQuadrupole[7]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
+    sA->labFrameQuadrupole[8]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
+
+    // induced dipole
+
+    sA->inducedDipole[0]         = cAmoebaSim.pInducedDipole[atomI*3];
+    sA->inducedDipole[1]         = cAmoebaSim.pInducedDipole[atomI*3+1];
+    sA->inducedDipole[2]         = cAmoebaSim.pInducedDipole[atomI*3+2];
+
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Calculate potential at grid point due atomI
+   Code adapted from TINKER routine potpoint in potpoint.f
+
+   @param atomI     atomI's coordinates and multipole moments
+   @param gridPoint grid coordinates
+   @param potential output potential
+
+   --------------------------------------------------------------------------------------- */
+
+__device__ void calculateElectrostaticPotentialForAtomGridPoint_kernel( volatile ElectrostaticPotentialParticle& atomI, volatile float4& gridPoint, float* potential ){
+  
+    float xr                 = atomI.x - gridPoint.x;
+    float yr                 = atomI.y - gridPoint.y;
+    float zr                 = atomI.z - gridPoint.z;
+   
+    float r2                 = xr*xr + yr*yr + zr*zr;
+    float r                  = sqrtf( r2 );
+
+    float rr1                = 1.0f/r;
+    *potential               = atomI.q*rr1;
+    float rr2                = rr1*rr1;
+    float rr3                = rr1*rr2;
+
+    float scd                = atomI.labFrameDipole[0]*xr     +  atomI.labFrameDipole[1]*yr    + atomI.labFrameDipole[2]*zr;
+    float scu                =  atomI.inducedDipole[0]*xr     +   atomI.inducedDipole[1]*yr    +  atomI.inducedDipole[2]*zr;
+    *potential              -= (scd + scu)*rr3;
+
+    float rr5                = 3.0f*rr3*rr2;
+    float scq                = xr*(atomI.labFrameQuadrupole[0]*xr + atomI.labFrameQuadrupole[1]*yr + atomI.labFrameQuadrupole[2]*zr);
+          scq               += yr*(atomI.labFrameQuadrupole[1]*xr + atomI.labFrameQuadrupole[4]*yr + atomI.labFrameQuadrupole[5]*zr);
+          scq               += zr*(atomI.labFrameQuadrupole[2]*xr + atomI.labFrameQuadrupole[5]*yr + atomI.labFrameQuadrupole[8]*zr);
+    *potential              += scq*rr5;
+
+    return;
+
+}
+
+// Include versions of the kernels for N x PotentialGridSize calculations.
+
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME 
+#define METHOD_NAME(a, b) a##NxG##b
+#include "kCalculateAmoebaCudaElectrostaticPotential.h"
+
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##NxGByWarp##b
+#include "kCalculateAmoebaCudaElectrostaticPotential.h"
+
+// Kernel to reduce potential
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReducePotential_kernel()
+{
+    unsigned int pos             = (blockIdx.x * blockDim.x + threadIdx.x);
+    float conversionFactor       = (cAmoebaSim.electric/cAmoebaSim.dielec);
+   
+    // Reduce potential
+    while (pos < cAmoebaSim.paddedPotentialGridSize)
+    {
+        float totalPotential         = 0.0f;
+        float* pFt                   = cAmoebaSim.pPotential + pos;
+        int i                        = cSim.outputBuffers;
+        while (i >= 4)
+        {
+            float f1             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f2             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f3             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f4             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            totalPotential      += f1 + f2 + f3 + f4;
+            i                   -= 4;
+        }
+        if (i >= 2)
+        {
+            float f1             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f2             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            totalPotential      += f1 + f2;
+            i                   -= 2;
+        }
+        if (i > 0)
+        {
+            totalPotential += *pFt;
+        }
+        totalPotential *= conversionFactor;
+        pFt             = cAmoebaSim.pPotential + pos;
+        *pFt            = totalPotential;
+        pos            += gridDim.x*blockDim.x;
+    }   
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Reduce Amoeba electrostatic potential
+
+   @param gpu        gpu context
+
+   --------------------------------------------------------------------------------------- */
+
+void kReducePotential(gpuContext gpu)
+{
+    kReducePotential_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
+    LAUNCHERROR("kReducePotential");
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Amoeba electrostatic potential
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+void cudaComputeAmoebaElectrostaticPotential( amoebaGpuContext amoebaGpu ){
+  
+   // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu = amoebaGpu->gpuContext;
+
+    // on first pass, set threads/block
+
+    static unsigned int threadsPerBlock = 0;
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            //maxThreads = 384;
+            maxThreads = 512;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticPotentialParticle), gpu->sharedMemoryPerBlock), maxThreads);
+    }
+
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaCudaElectrostaticPotentialNxGByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticPotentialParticle)*threadsPerBlock>>>( );
+    } else {
+        kCalculateAmoebaCudaElectrostaticPotentialNxG_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticPotentialParticle)*threadsPerBlock>>>( );
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticPotential");
+
+    kReducePotential( amoebaGpu->gpuContext );
+
+   // ---------------------------------------------------------------------------------------
+}
+
+void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu ) 
+{
+
+    // setup
+
+    kSetupAmoebaMultipoleForces(amoebaGpu, false ); 
+
+    // calculate electrostatic potential
+
+    cudaComputeAmoebaElectrostaticPotential( amoebaGpu );
+
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.h
@@ -24,6 +24,8 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

+#include "amoebaScaleFactors.h"
+
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(512, 1)

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -422,7 +422,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )

 }

-static void kSetupAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
+void kSetupAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
 {
    std::string methodName  = "kSetupAmoebaMultipoleForces";

@@ -500,159 +500,3 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    }

 }
-
-void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu ) 
-{
-
-    // setup
-
-    kSetupAmoebaMultipoleForces(amoebaGpu, false ); 
-
-    // calculate electrostatic potential
-
-    cudaComputeAmoebaElectrostaticPotential( amoebaGpu );
-
-}
-
-void kCalculateAmoebaSystemMultipoleMoments(amoebaGpuContext amoebaGpu, const OpenMM::Vec3& origin, std::vector< double >& outputMultipoleMoments ) 
-{
-
-    // setup
-
-    kSetupAmoebaMultipoleForces(amoebaGpu, false ); 
-
-    gpuContext gpu         = amoebaGpu->gpuContext;
-
-    gpu->psPosq4->Download();
-    gpu->psVelm4->Download();
-    float4* posq           = gpu->psPosq4->_pSysData;    
-    float4* velm           = gpu->psVelm4->_pSysData;    
-    float totalMass        = 0.0f;
-    float centerOfMass[3]  = { 0.0f, 0.0f, 0.0f };
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        float mass;
-        if( velm->w > 0.0f ){
-            mass        = 1.0f/velm[ii].w;
-        } else {
-            mass        = 0.0f;
-        }
-        totalMass        += mass;
-        centerOfMass[0]  += mass*posq[ii].x;
-        centerOfMass[1]  += mass*posq[ii].y;
-        centerOfMass[2]  += mass*posq[ii].z;
-    }
-
-    std::vector<float4>  posqLocal(gpu->natoms);
-    if( totalMass > 0.0f ){
-        centerOfMass[0]  /= totalMass;
-        centerOfMass[1]  /= totalMass;
-        centerOfMass[2]  /= totalMass;
-    }
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        posqLocal[ii].x = posq[ii].x  - centerOfMass[0];
-        posqLocal[ii].y = posq[ii].y  - centerOfMass[1];
-        posqLocal[ii].z = posq[ii].z  - centerOfMass[2];
-        posqLocal[ii].w = posq[ii].w;
-    }
-
-    float netchg  = 0.0f;
-
-    float xdpl    = 0.0f;
-    float ydpl    = 0.0f;
-    float zdpl    = 0.0f;
-
-    float xxqdp   = 0.0f;
-    float xyqdp   = 0.0f;
-    float xzqdp   = 0.0f;
-
-    float yxqdp   = 0.0f;
-    float yyqdp   = 0.0f;
-    float yzqdp   = 0.0f;
-
-    float zxqdp   = 0.0f;
-    float zyqdp   = 0.0f;
-    float zzqdp   = 0.0f;
-
-    amoebaGpu->psLabFrameDipole->Download();
-    float* labFrameDipole      = amoebaGpu->psLabFrameDipole->_pSysData;    
-
-    amoebaGpu->psInducedDipole->Download();
-    float* inducedDipole       = amoebaGpu->psInducedDipole->_pSysData;    
-
-    amoebaGpu->psLabFrameQuadrupole->Download();
-    float* labFrameQuadrupole  = amoebaGpu->psLabFrameQuadrupole->_pSysData;    
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-
-        netchg              += posqLocal[ii].w;
-
-        float netDipoleX     = (labFrameDipole[3*ii]    + inducedDipole[3*ii]);
-        float netDipoleY     = (labFrameDipole[3*ii+1]  + inducedDipole[3*ii+1]);
-        float netDipoleZ     = (labFrameDipole[3*ii+2]  + inducedDipole[3*ii+2]);
-
-        xdpl    += posqLocal[ii].x*posqLocal[ii].w + netDipoleX;
-        ydpl    += posqLocal[ii].y*posqLocal[ii].w + netDipoleY;
-        zdpl    += posqLocal[ii].z*posqLocal[ii].w + netDipoleZ;
-
-        xxqdp   += posqLocal[ii].x*posqLocal[ii].x*posqLocal[ii].w + 2.0f*posqLocal[ii].x*netDipoleX;
-        xyqdp   += posqLocal[ii].x*posqLocal[ii].y*posqLocal[ii].w + posqLocal[ii].x*netDipoleY + posqLocal[ii].y*netDipoleX;
-        xzqdp   += posqLocal[ii].x*posqLocal[ii].z*posqLocal[ii].w + posqLocal[ii].x*netDipoleZ + posqLocal[ii].z*netDipoleX;
-
-        yxqdp   += posqLocal[ii].y*posqLocal[ii].x*posqLocal[ii].w + posqLocal[ii].y*netDipoleX + posqLocal[ii].x*netDipoleY;
-        yyqdp   += posqLocal[ii].y*posqLocal[ii].y*posqLocal[ii].w + 2.0f*posqLocal[ii].y*netDipoleY;
-        yzqdp   += posqLocal[ii].y*posqLocal[ii].z*posqLocal[ii].w + posqLocal[ii].y*netDipoleZ + posqLocal[ii].z*netDipoleY;
-
-        zxqdp   += posqLocal[ii].z*posqLocal[ii].x*posqLocal[ii].w + posqLocal[ii].z*netDipoleX + posqLocal[ii].x*netDipoleZ;
-        zyqdp   += posqLocal[ii].z*posqLocal[ii].y*posqLocal[ii].w + posqLocal[ii].z*netDipoleY + posqLocal[ii].y*netDipoleZ;
-        zzqdp   += posqLocal[ii].z*posqLocal[ii].z*posqLocal[ii].w + 2.0f*posqLocal[ii].z*netDipoleZ;
-
-    }
-
-//  convert the quadrupole from traced to traceless form
- 
-    float qave   = (xxqdp + yyqdp + zzqdp)/3.0f;
-          xxqdp  = 1.5f*(xxqdp-qave);
-          xyqdp  = 1.5f*xyqdp;
-          xzqdp  = 1.5f*xzqdp;
-          yxqdp  = 1.5f*yxqdp;
-          yyqdp  = 1.5f*(yyqdp-qave);
-          yzqdp  = 1.5f*yzqdp;
-          zxqdp  = 1.5f*zxqdp;
-          zyqdp  = 1.5f*zyqdp;
-          zzqdp  = 1.5f*(zzqdp-qave);
-
-//  add the traceless atomic quadrupoles to total quadrupole
-
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        xxqdp  = xxqdp + 3.0f*labFrameQuadrupole[9*ii];
-        xyqdp  = xyqdp + 3.0f*labFrameQuadrupole[9*ii+1];
-        xzqdp  = xzqdp + 3.0f*labFrameQuadrupole[9*ii+2];
-        yxqdp  = yxqdp + 3.0f*labFrameQuadrupole[9*ii+3];
-        yyqdp  = yyqdp + 3.0f*labFrameQuadrupole[9*ii+4];
-        yzqdp  = yzqdp + 3.0f*labFrameQuadrupole[9*ii+5];
-        zxqdp  = zxqdp + 3.0f*labFrameQuadrupole[9*ii+6];
-        zyqdp  = zyqdp + 3.0f*labFrameQuadrupole[9*ii+7];
-        zzqdp  = zzqdp + 3.0f*labFrameQuadrupole[9*ii+8];
-    }
- 
-    float debye                = 4.80321f;
-    outputMultipoleMoments.resize( 13 );
-
-    outputMultipoleMoments[0]  = netchg;
-
-    outputMultipoleMoments[1]  = xdpl*debye;
-    outputMultipoleMoments[2]  = ydpl*debye;
-    outputMultipoleMoments[3]  = zdpl*debye;
-    
-    outputMultipoleMoments[4]  = xxqdp*debye;
-    outputMultipoleMoments[5]  = xyqdp*debye;
-    outputMultipoleMoments[6]  = xzqdp*debye;
-
-    outputMultipoleMoments[7]  = yxqdp*debye;
-    outputMultipoleMoments[8]  = yyqdp*debye;
-    outputMultipoleMoments[9]  = yzqdp*debye;
-
-    outputMultipoleMoments[10] = zxqdp*debye;
-    outputMultipoleMoments[11] = zyqdp*debye;
-    outputMultipoleMoments[12] = zzqdp*debye;
-
-}
--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
@@ -367,7 +367,7 @@ static void compareForcesEnergy( std::string& testName, double expectedEnergy, d

 static void compareForceNormsEnergy( std::string& testName, double expectedEnergy, double energy,
                                     std::vector<Vec3>& expectedForces,
-                                     std::vector<Vec3>& forces, double tolerance, FILE* log ) {
+                                     const std::vector<Vec3>& forces, double tolerance, FILE* log ) {


 //#define AMOEBA_DEBUG
@@ -935,10 +935,14 @@ static void testQuadrupoleValidation( FILE* log ){

 // setup for box of 2 water molecules and 3 ions

+// this method does too much; I tried passing the context ptr back to 
+// the tests methods, but the tests would seg fault w/ a bad_alloc error
+
 static void setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::AmoebaNonbondedMethod nonbondedMethod,
                                                          AmoebaMultipoleForce::AmoebaPolarizationType polarizationType,
-                                                          double cutoff, int inputPmeGridDimension, std::vector<Vec3>& forces,
-                                                          double& energy, FILE* log ){
+                                                          double cutoff, int inputPmeGridDimension, std::string testName,
+                                                          std::vector<Vec3>& forces, double& energy, std::vector< double >& outputMultipoleMoments,
+                                                          std::vector<Vec3>& inputGrid, std::vector< double >& outputGridPotential, FILE* log ){

    // beginning of Multipole setup

@@ -964,7 +968,7 @@ static void setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::
    amoebaMultipoleForce->setMutualInducedTargetEpsilon( 1.0e-06 );
    amoebaMultipoleForce->setMutualInducedMaxIterations( 500 );
    amoebaMultipoleForce->setAEwald( 5.4459052e+00 );
-    amoebaMultipoleForce->setEwaldErrorTolerance( 1.0e-04 );
+    amoebaMultipoleForce->setEwaldErrorTolerance( 1.0e-05 );

    std::vector<int> pmeGridDimension( 3 );
    pmeGridDimension[0] = pmeGridDimension[1] = pmeGridDimension[2] = inputPmeGridDimension;
@@ -997,7 +1001,7 @@ static void setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::
    // waters

    for( unsigned int jj = 2; jj < numberOfParticles; jj += 3 ){
-        system.addParticle( 1.5995000e+01 );
+        system.addParticle( 1.5999000e+01 );
        system.addParticle( 1.0080000e+00 );
        system.addParticle( 1.0080000e+00 );
    }
@@ -1115,12 +1119,23 @@ static void setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::
    std::string platformName;
    platformName = "Cuda";
    LangevinIntegrator integrator(0.0, 0.1, 0.01);
-    Context context(system, integrator, Platform::getPlatformByName( platformName ) );
+    Context context = Context(system, integrator, Platform::getPlatformByName( platformName ) );

    context.setPositions(positions);
-    State state                      = context.getState(State::Forces | State::Energy);
-    forces                           = state.getForces();
-    energy                           = state.getPotentialEnergy();
+
+    if( testName == "testSystemMultipoleMoments" ){
+        Vec3 origin( 0.0, 0.0, 0.0 );
+        amoebaMultipoleForce->getSystemMultipoleMoments( origin, context, outputMultipoleMoments );
+    } else if( testName == "testMultipoleGridPotential" ){
+        amoebaMultipoleForce->getElectrostaticPotential( inputGrid, context, outputGridPotential );
+    } else {
+        State state               = context.getState(State::Forces | State::Energy);
+        forces                    = state.getForces();
+        energy                    = state.getPotentialEnergy();
+    }
+
+    return;
+
 }

 // test multipole mutual polarization using PME for system comprised of 2 ions and 2 waters
@@ -1132,11 +1147,19 @@ static void testMultipoleIonsAndWaterPMEDirectPolarization( FILE* log ) {
    int numberOfParticles     = 8;
    int inputPmeGridDimension = 64;
    double cutoff             = 0.70;
+
    std::vector<Vec3> forces;
    double energy;

+    std::vector<double> outputMultipoleMoments;
+
+    std::vector<Vec3> inputGrid;
+    std::vector<double> outputGridPotential;
+
    setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::PME, AmoebaMultipoleForce::Direct, 
-                                                  cutoff, inputPmeGridDimension, forces, energy, log );
+                                                  cutoff, inputPmeGridDimension, testName, forces, energy, outputMultipoleMoments,
+                                                  inputGrid, outputGridPotential, log );
+
    std::vector<Vec3> expectedForces(numberOfParticles);

    double expectedEnergy     =  -4.6859568e+01;
@@ -1152,39 +1175,197 @@ static void testMultipoleIonsAndWaterPMEDirectPolarization( FILE* log ) {

    double tolerance          = 5.0e-04;
    compareForceNormsEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+
 }

 // test multipole mutual polarization using PME for system comprised of 2 ions and 2 waters

 static void testMultipoleIonsAndWaterPMEMutualPolarization( FILE* log ) {

-    std::string testName      = "testMultipoleIonsAndWaterMutualPolarization";
+    std::string testName            = "testMultipoleIonsAndWaterMutualPolarization";
+
+    int numberOfParticles           = 8;
+    int inputPmeGridDimension       = 64;
+    double cutoff                   = 0.70;

-    int numberOfParticles     = 8;
-    int inputPmeGridDimension = 64;
-    double cutoff             = 0.70;
    std::vector<Vec3> forces;
    double energy;

+    std::vector<double> outputMultipoleMoments;
+
+    std::vector<Vec3> inputGrid;
+    std::vector<double> outputGridPotential;
+
    setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::PME, AmoebaMultipoleForce::Mutual, 
-                                                  cutoff, inputPmeGridDimension, forces, energy, log );
+                                                  cutoff, inputPmeGridDimension, testName, forces, energy, outputMultipoleMoments,
+                                                  inputGrid, outputGridPotential, log );
+
    std::vector<Vec3> expectedForces(numberOfParticles);

-    double expectedEnergy     = -4.6859424e+01;
+    double expectedEnergy           = -4.6859424e+01;

-    expectedForces[0]         = Vec3(  -9.1272358e+00,   1.5191516e+01,  -4.0058826e+00 );
-    expectedForces[1]         = Vec3(  -1.0497156e+00,   1.4622425e+01,   1.1789420e+01 );
-    expectedForces[2]         = Vec3(  -3.2560478e+00,   6.5289712e+00,  -2.9779483e+00 );
-    expectedForces[3]         = Vec3(   3.0672153e+00,  -8.4407797e-01,  -3.4094884e+00 );
-    expectedForces[4]         = Vec3(   1.1382586e+00,  -3.1512949e+00,  -1.1387028e+00 );
-    expectedForces[5]         = Vec3(  -6.1050295e+00,   9.5345692e-01,   1.1488832e-01 );
-    expectedForces[6]         = Vec3(   1.9319945e+00,  -5.5747599e-01,  -4.8469044e+00 );
-    expectedForces[7]         = Vec3(   4.0622614e+00,  -3.3687594e+00,  -1.6986575e+00 );
+    expectedForces[0]               = Vec3(  -9.1272358e+00,   1.5191516e+01,  -4.0058826e+00 );
+    expectedForces[1]               = Vec3(  -1.0497156e+00,   1.4622425e+01,   1.1789420e+01 );
+    expectedForces[2]               = Vec3(  -3.2560478e+00,   6.5289712e+00,  -2.9779483e+00 );
+    expectedForces[3]               = Vec3(   3.0672153e+00,  -8.4407797e-01,  -3.4094884e+00 );
+    expectedForces[4]               = Vec3(   1.1382586e+00,  -3.1512949e+00,  -1.1387028e+00 );
+    expectedForces[5]               = Vec3(  -6.1050295e+00,   9.5345692e-01,   1.1488832e-01 );
+    expectedForces[6]               = Vec3(   1.9319945e+00,  -5.5747599e-01,  -4.8469044e+00 );
+    expectedForces[7]               = Vec3(   4.0622614e+00,  -3.3687594e+00,  -1.6986575e+00 );

-    //double tolerance          = 1.0e-03;
+    //double tolerance                = 1.0e-03;
    //compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
-    double tolerance          = 5.0e-04;
+    double tolerance                = 5.0e-04;
    compareForceNormsEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+
+}
+
+// test computation of system multipole moments
+
+static void testSystemMultipoleMoments( FILE* log ) {
+
+    std::string testName      = "testSystemMultipoleMoments";
+    
+    int numberOfParticles     = 8;
+    int inputPmeGridDimension = 64;
+    double cutoff             = 0.70;
+
+    std::vector<Vec3> forces;
+    double energy;
+    std::vector<double> outputMultipoleMoments;
+
+    std::vector<Vec3> inputGrid;
+    std::vector<double> outputGridPotential;
+
+    setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::PME, AmoebaMultipoleForce::Mutual, 
+                                                  cutoff, inputPmeGridDimension, testName, forces, energy, outputMultipoleMoments,
+                                                  inputGrid, outputGridPotential, log );
+
+    std::vector<double> tinkerMoments(13);
+    tinkerMoments[0]  =    0.0000000e+00;
+    tinkerMoments[1]  =   -8.5884599e+00;
+    tinkerMoments[2]  =    1.7447506e+01;
+    tinkerMoments[3]  =    3.9868461e+00;
+    tinkerMoments[4]  =   -2.7977319e+00;
+    tinkerMoments[5]  =   -7.8694903e+00;
+    tinkerMoments[6]  =   -2.6429049e+00;
+    tinkerMoments[7]  =   -7.8694903e+00;
+    tinkerMoments[8]  =    7.4048693e+00;
+    tinkerMoments[9]  =    6.7152875e+00;
+    tinkerMoments[10] =   -2.6429049e+00;
+    tinkerMoments[11] =    6.7152875e+00;
+    tinkerMoments[12] =   -4.6071374e+00;
+
+    double tolerance = 1.0e-04;
+    for( unsigned int ii = 0; ii < tinkerMoments.size(); ii++ ){
+        double difference = fabs( outputMultipoleMoments[ii] - tinkerMoments[ii] );
+        //fprintf( stderr, "%2d %15.7e %15.7e %15.7e %15.7e\n", ii, difference, difference/fabs(tinkerMoments[ii]), outputMultipoleMonents[ii], tinkerMoments[ii] );
+        if( difference > tolerance ){
+            std::stringstream details;
+            details << testName << "Multipole moment " << ii << " does not agree w/ TINKER computed moments: OpenMM=" << outputMultipoleMoments[ii];
+            details << " TINKER=" <<  tinkerMoments[ii]  << " difference=" << difference;
+            throwException(__FILE__, __LINE__, details.str());
+        }
+    }
+
+}
+
+// test computation of multipole potential on a grid 
+
+static void testMultipoleGridPotential( FILE* log ) {
+
+    std::string testName      = "testMultipoleGridPotential";
+    
+    int numberOfParticles     = 8;
+    int inputPmeGridDimension = 64;
+    double cutoff             = 0.70;
+
+    std::vector<Vec3> forces;
+    double energy;
+
+    std::vector<double> outputMultipoleMoments;
+
+    // initialize grid
+
+    int gridSize  = 27;
+    std::vector<Vec3> inputGrid(gridSize);
+    inputGrid[0]  = Vec3( -3.2894000e+00,  -1.3098000e+00,  -1.7064092e+00);
+    inputGrid[1]  = Vec3( -3.2894000e+00,  -1.3098000e+00,   6.3928525e-01);
+    inputGrid[2]  = Vec3( -3.2894000e+00,  -1.3098000e+00,   2.9849797e+00);
+    inputGrid[3]  = Vec3( -3.2894000e+00,   5.1360000e-01,  -1.7064092e+00);
+    inputGrid[4]  = Vec3( -3.2894000e+00,   5.1360000e-01,   6.3928525e-01);
+    inputGrid[5]  = Vec3( -3.2894000e+00,   5.1360000e-01,   2.9849797e+00);
+    inputGrid[6]  = Vec3( -3.2894000e+00,   2.3370000e+00,  -1.7064092e+00);
+    inputGrid[7]  = Vec3( -3.2894000e+00,   2.3370000e+00,   6.3928525e-01);
+    inputGrid[8]  = Vec3( -3.2894000e+00,   2.3370000e+00,   2.9849797e+00);
+    inputGrid[9]  = Vec3( -2.3754000e+00,  -1.3098000e+00,  -1.7064092e+00);
+    inputGrid[10] = Vec3( -2.3754000e+00,  -1.3098000e+00,   6.3928525e-01);
+    inputGrid[11] = Vec3( -2.3754000e+00,  -1.3098000e+00,   2.9849797e+00);
+    inputGrid[12] = Vec3( -2.3754000e+00,   5.1360000e-01,  -1.7064092e+00);
+    inputGrid[13] = Vec3( -2.3754000e+00,   5.1360000e-01,   6.3928525e-01);
+    inputGrid[14] = Vec3( -2.3754000e+00,   5.1360000e-01,   2.9849797e+00);
+    inputGrid[15] = Vec3( -2.3754000e+00,   2.3370000e+00,  -1.7064092e+00);
+    inputGrid[16] = Vec3( -2.3754000e+00,   2.3370000e+00,   6.3928525e-01);
+    inputGrid[17] = Vec3( -2.3754000e+00,   2.3370000e+00,   2.9849797e+00);
+    inputGrid[18] = Vec3( -1.4614000e+00,  -1.3098000e+00,  -1.7064092e+00);
+    inputGrid[19] = Vec3( -1.4614000e+00,  -1.3098000e+00,   6.3928525e-01);
+    inputGrid[20] = Vec3( -1.4614000e+00,  -1.3098000e+00,   2.9849797e+00);
+    inputGrid[21] = Vec3( -1.4614000e+00,   5.1360000e-01,  -1.7064092e+00);
+    inputGrid[22] = Vec3( -1.4614000e+00,   5.1360000e-01,   6.3928525e-01);
+    inputGrid[23] = Vec3( -1.4614000e+00,   5.1360000e-01,   2.9849797e+00);
+    inputGrid[24] = Vec3( -1.4614000e+00,   2.3370000e+00,  -1.7064092e+00);
+    inputGrid[25] = Vec3( -1.4614000e+00,   2.3370000e+00,   6.3928525e-01);
+    inputGrid[26] = Vec3( -1.4614000e+00,   2.3370000e+00,   2.9849797e+00);
+
+    std::vector<double> outputGridPotential;
+
+    setupAndGetForcesEnergyMultipoleIonsAndWater( AmoebaMultipoleForce::PME, AmoebaMultipoleForce::Mutual, 
+                                                  cutoff, inputPmeGridDimension, testName, forces, energy, outputMultipoleMoments,
+                                                  inputGrid, outputGridPotential, log );
+
+    // TINKER computed grid values
+
+    std::vector<double> tinkerGridPotential(gridSize);
+    tinkerGridPotential[0]  =  -1.8587681e+01;
+    tinkerGridPotential[1]  =  -3.7731481e+01;
+    tinkerGridPotential[2]  =  -1.4093518e+01;
+    tinkerGridPotential[3]  =  -8.6733284e-01;
+    tinkerGridPotential[4]  =   1.6378362e+01;
+    tinkerGridPotential[5]  =   1.7545816e+01;
+    tinkerGridPotential[6]  =   1.2115085e+01;
+    tinkerGridPotential[7]  =   1.5693689e+02;
+    tinkerGridPotential[8]  =   5.6517394e+01;
+    tinkerGridPotential[9]  =  -2.8489173e+01;
+    tinkerGridPotential[10] =  -1.1037348e+02;
+    tinkerGridPotential[11] =  -7.1050586e+01;
+    tinkerGridPotential[12] =  -5.9901289e+00;
+    tinkerGridPotential[13] =  -4.0533846e+00;
+    tinkerGridPotential[14] =   1.0506199e+01;
+    tinkerGridPotential[15] =  -8.6421838e+00;
+    tinkerGridPotential[16] =   8.3729402e+01;
+    tinkerGridPotential[17] =   4.4286652e+01;
+    tinkerGridPotential[18] =  -3.4746066e+01;
+    tinkerGridPotential[19] =  -1.0763056e+03;
+    tinkerGridPotential[20] =  -2.0034673e+01;
+    tinkerGridPotential[21] =  -1.2131063e+01;
+    tinkerGridPotential[22] =  -2.4662346e+01;
+    tinkerGridPotential[23] =   1.4630799e+00;
+    tinkerGridPotential[24] =   5.1623298e+00;
+    tinkerGridPotential[25] =   3.3114482e+01;
+    tinkerGridPotential[26] =   2.5828622e+01;
+
+    double tolerance = 1.0e-04;
+    for( unsigned int ii = 0; ii < gridSize; ii++ ){
+        double difference = fabs( outputGridPotential[ii] - tinkerGridPotential[ii] );
+        //fprintf( stderr, "%2d %15.7e %15.7e %15.7e %15.7e\n", ii, difference, difference/fabs(tinkerGridPotential[ii]), outputGridPotential[ii], tinkerGridPotential[ii] );
+        if( difference > tolerance ){
+            std::stringstream details;
+            details << testName << "Multipole moment " << ii << " does not agree w/ TINKER computed moments: OpenMM=" << outputGridPotential[ii];
+            details << " TINKER=" <<  tinkerGridPotential[ii]  << " difference=" << difference;
+            throwException(__FILE__, __LINE__, details.str());
+        }
+    }
+
 }

 int main( int numberOfArguments, char* argv[] ) {
@@ -1219,6 +1400,14 @@ int main( int numberOfArguments, char* argv[] ) {
        testMultipoleIonsAndWaterPMEMutualPolarization( log );
        testMultipoleIonsAndWaterPMEDirectPolarization( log );

+        // test computation of system multipole moments
+
+        testSystemMultipoleMoments( log );
+
+        // test computation of grid potential
+
+        testMultipoleGridPotential( log );
+
    } catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;
        std::cout << "FAIL - ERROR.  Test failed." << std::endl;