Direct space PME

2ef05285 · Mark Friedrichs · c22f00cf · 2ef05285 · 2ef05285 · 2ef05285
Commit 2ef05285 authored Sep 13, 2010 by Mark Friedrichs
13 changed files
--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -38,8 +38,7 @@ ENDIF(LOG)
 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-#SET(OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS . openmmapi platforms/reference)
+SET(OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS . openmmapi platforms/reference)
-SET(OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS . openmmapi )
 # Collect up information about the version of the OpenMM library we're building
 # and make it available to the code so it can be built into the binaries.
@@ -103,8 +102,10 @@ FOREACH(subdir ${OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS})
    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
 ENDFOREACH(subdir)
-#INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src)
+INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src)
-#INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src/SimTKReference)
+INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src/SimTKReference)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/platforms/reference/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/platforms/reference/src/SimTKReference)
 # ----------------------------------------------------------------------------
 IF(LOG)
@@ -158,7 +159,7 @@ IF(OPENMM_BUILD_STATIC_LIB)
  TARGET_LINK_LIBRARIES( ${STATIC_AMOEBA_TARGET} ${STATIC_TARGET} )
 ENDIF(OPENMM_BUILD_STATIC_LIB)
-#ADD_SUBDIRECTORY(platforms/reference/tests)
+ADD_SUBDIRECTORY(platforms/reference/tests)
 # Which hardware platforms to build
 IF(CUDA_FOUND)

--- a/plugins/amoeba/openmmapi/include/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/AmoebaMultipoleForce.h
@@ -113,6 +113,20 @@ public:
     */
    void setCutoffDistance(double distance);
+   /**		   
+     * Get the aEwald parameter		 
+     *		 
+     * @return the Ewald parameter		 
+     */		 
+    double getAEwald() const;			 
+    /**		 
+     * Set the aEwald parameter		 
+     *		 
+     * @param Ewald parameter			 
+     */		 
+    void setAEwald(double aewald);		 
    /**
     * Add multipole-related info for a particle 
     *
@@ -279,6 +293,7 @@ private:
    AmoebaNonbondedMethod nonbondedMethod;
    double cutoffDistance;
+    double aewald;
    MutualInducedIterationMethod mutualInducedIterationMethod;
    int mutualInducedMaxIterations;
    double mutualInducedTargetEpsilon;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -56,6 +56,14 @@ void AmoebaMultipoleForce::setCutoffDistance(double distance) {
    cutoffDistance = distance;
 }
+double AmoebaMultipoleForce::getAEwald() const {		 
+    return aewald;		 
+}		 
+void AmoebaMultipoleForce::setAEwald(double inputAewald ) {			 
+    aewald = inputAewald;		 
+}		 
 AmoebaMultipoleForce::MutualInducedIterationMethod AmoebaMultipoleForce::getMutualInducedIterationMethod( void ) const {
    return mutualInducedIterationMethod;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -589,6 +589,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
                                    static_cast<float>( force.getMutualInducedTargetEpsilon()),
                                    nonbondedMethod,
                                    static_cast<float>( force.getCutoffDistance()),
+                                    static_cast<float>( force.getAEwald()),
                                    static_cast<float>( force.getElectricConstant()) );
    if (nonbondedMethod == AmoebaMultipoleForce::PME) {
        double alpha;

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -350,6 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pP_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pP_ScaleIndices );
    (void) fprintf( log, "     pM_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pM_ScaleIndices );
    (void) fprintf( log, "     sqrtPi                             %15.7e\n",  amoebaGpu->amoebaSim.sqrtPi );
+    (void) fprintf( log, "     alpha Ewald                        %15.7e\n",  gpu->sim.alphaEwald );
    (void) fprintf( log, "     cutoffDistance2                    %15.7e\n",  amoebaGpu->amoebaSim.cutoffDistance2 );
    (void) fprintf( log, "     electric                           %15.7e\n",  amoebaGpu->amoebaSim.electric );
    (void) fprintf( log, "     box                                %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
@@ -1463,7 +1464,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector< std::vector< std::vector<int> > >& multipoleParticleCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange, 
                                     int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, float cutoffDistance, float electricConstant ){
+                                     int nonbondedMethod, float cutoffDistance, float alphaEwald, float electricConstant ){
 // ---------------------------------------------------------------------------------------
@@ -1565,9 +1566,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    amoebaGpu->amoebaSim.cutoffDistance2             = cutoffDistance*cutoffDistance;
    amoebaGpu->amoebaSim.sqrtPi                      = sqrt( 3.1415926535897932384626433832795 );
    amoebaGpu->amoebaSim.electric                    = electricConstant;
+    amoebaGpu->gpuContext->sim.alphaEwald            = alphaEwald;
    amoebaGpu->gpuContext->sim.nonbondedCutoff       = cutoffDistance;
-    tabulateErfc(amoebaGpu->gpuContext); 
    if( amoebaGpu->amoebaSim.dielec < 1.0e-05 ){
        amoebaGpu->amoebaSim.dielec               = 1.0f;
@@ -2766,7 +2766,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
    SetCalculateAmoebaCudaPmeMutualInducedFieldSim( amoebaGpu );
    SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu );
    SetCalculateAmoebaElectrostaticSim( amoebaGpu );
-    SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );
+    SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpu );
    SetCalculateAmoebaCudaMapTorquesSim( amoebaGpu );
    SetCalculateAmoebaKirkwoodSim( amoebaGpu );
    SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -106,9 +106,9 @@ extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu );
-extern void SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
+extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
+extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu );
+extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu );
 extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -307,7 +307,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
                                     int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, float cutoffDistance, float electricConstant );
+                                     int nonbondedMethod, float cutoffDistance,  float alphaEwald, float electricConstant );
 extern "C"

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
@@ -11,34 +11,24 @@
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
-void SetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
+void SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
 {
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
-    RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+    RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
 }
-void GetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
+void GetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
 {
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
-    RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+    RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
-static __device__ float fastErfc(float r)
-{
-    float normalized = cSim.tabulatedErfcScale*r;
-    int index = (int) normalized;
-    float fract2 = normalized-index;
-    float fract1 = 1.0f-fract2;
-    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
 }
 static int const PScaleIndex            =  0; 
@@ -60,7 +50,7 @@ static int const MScaleIndex            =  3;
 //static int const Ddsc72Index            = 16;
 static int const LastScalingIndex       = 4;
-struct RealSpaceEwaldParticle {
+struct PmeDirectElectrostaticParticle {
    // coordinates charge
@@ -97,30 +87,59 @@ struct RealSpaceEwaldParticle {
 };
-__device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& atomI,   RealSpaceEwaldParticle& atomJ,
+// self-energy for PME
-                                                       float scalingDistanceCutoff,   float* scalingFactors,
-                                                       float*  outputForce,           float  outputTorque[2][3],
+__device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, float* energy)
-                                                       float* energy
+{
+    float term   = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
+    float fterm  = -(cAmoebaSim.electric/cAmoebaSim.dielec)*cSim.alphaEwald/cAmoebaSim.sqrtPi;
+    float cii    = atomI.q*atomI.q;
+    float dii    = atomI.labFrameDipole[0]*atomI.labFrameDipole[0] +
+                   atomI.labFrameDipole[1]*atomI.labFrameDipole[1] +
+                   atomI.labFrameDipole[2]*atomI.labFrameDipole[2];
+    float qii    = atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[0] +
+                   atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4] +
+                   atomI.labFrameQuadrupole[8]*atomI.labFrameQuadrupole[8] + 2.0f*(
+                   atomI.labFrameQuadrupole[1]*atomI.labFrameQuadrupole[1] +
+                   atomI.labFrameQuadrupole[2]*atomI.labFrameQuadrupole[2] +
+                   atomI.labFrameQuadrupole[5]*atomI.labFrameQuadrupole[5]);
+    float uii    = atomI.labFrameDipole[0]*atomI.inducedDipole[0] + atomI.labFrameDipole[1]*atomI.inducedDipole[1] + atomI.labFrameDipole[2]*atomI.inducedDipole[2];
+    *energy      = (cii + term*(dii/3.0f + 2.0f*term*qii/5.0f));
+    *energy     += term*uii/3.0f;
+    *energy     *= fterm;
+}
+// self-torque for PME
+__device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI)
+{
+    float term       = (4.0f/3.0f)*(cAmoebaSim.electric/cAmoebaSim.dielec)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
+    float uix        = 0.5f*(atomI.inducedDipole[0] + atomI.inducedDipoleP[0]);
+    float uiy        = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
+    float uiz        = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
+    atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
+    atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
+    atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
+}
+__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI,   PmeDirectElectrostaticParticle& atomJ,
+                                                               float* scalingFactors, float*  outputForce, float  outputTorque[2][3],
+                                                               float* energy
 #ifdef AMOEBA_DEBUG
-                                                    ,float4* debugArray 
+                                                               ,float4* debugArray 
 #endif
 ){
-    float e,ei,f;
+    float e,ei;
-    //float gfd,gfdr;
-    //float xix,yix,zix;
-    //float xiy,yiy,ziy;
-    //float xiz,yiz,ziz;
-    //float xkx,ykx,zkx;
-    //float xky,yky,zky;
-    //float xkz,ykz,zkz;
-    //float rr1,rr3;
-    //float rr5,rr7,rr9,rr11;
    float erl,erli;
-    //float frcxi[4],frcxk[4];
-    //float frcyi[4],frcyk[4];
-    //float frczi[4],frczk[4];
    float di[4],qi[10];
    float dk[4],qk[10];
    float fridmp[4],findmp[4];
@@ -130,7 +149,6 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
    float ttm2i[4],ttm3i[4];
    float ttm2r[4],ttm3r[4];
    float ttm2ri[4],ttm3ri[4];
-    //float fdir[4]
    float dixdk[4];
    float dkxui[4],dixuk[4];
    float dixukp[4],dkxuip[4];
@@ -159,7 +177,7 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
    float gfr[8],gfri[7];
    float gti[7],gtri[7];
-    f = (cAmoebaSim.electric/cAmoebaSim.dielec);
+    float conversionFactor   = (cAmoebaSim.electric/cAmoebaSim.dielec);
    // set the permanent multipole and induced dipole values;
@@ -193,28 +211,29 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
    float r2    = xr*xr + yr*yr + zr*zr;
    if( r2 <= cAmoebaSim.cutoffDistance2 ){
        float r      = sqrt(r2);
        float ck     = atomJ.q;
-              dk[1]  = atomJ.labFrameDipole[0];
+        dk[1]  = atomJ.labFrameDipole[0];
-              dk[2]  = atomJ.labFrameDipole[1];
+        dk[2]  = atomJ.labFrameDipole[1];
-              dk[3]  = atomJ.labFrameDipole[2];
+        dk[3]  = atomJ.labFrameDipole[2];
-              qk[1]  = atomJ.labFrameQuadrupole[0];
+        qk[1]  = atomJ.labFrameQuadrupole[0];
-              qk[2]  = atomJ.labFrameQuadrupole[1];
+        qk[2]  = atomJ.labFrameQuadrupole[1];
-              qk[3]  = atomJ.labFrameQuadrupole[2];
+        qk[3]  = atomJ.labFrameQuadrupole[2];
-              qk[4]  = atomJ.labFrameQuadrupole[3];
+        qk[4]  = atomJ.labFrameQuadrupole[3];
-              qk[5]  = atomJ.labFrameQuadrupole[4];
+        qk[5]  = atomJ.labFrameQuadrupole[4];
-              qk[6]  = atomJ.labFrameQuadrupole[5];
+        qk[6]  = atomJ.labFrameQuadrupole[5];
-              qk[7]  = atomJ.labFrameQuadrupole[6];
+        qk[7]  = atomJ.labFrameQuadrupole[6];
-              qk[8]  = atomJ.labFrameQuadrupole[7];
+        qk[8]  = atomJ.labFrameQuadrupole[7];
-              qk[9]  = atomJ.labFrameQuadrupole[8];
+        qk[9]  = atomJ.labFrameQuadrupole[8];
        // calculate the real space error function terms;
        float ralpha = cSim.alphaEwald*r;
-               bn[0] = fastErfc(ralpha)/r;
+               bn[0] = erfc(ralpha)/r;
        float alsq2  = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n = 0.0f;
@@ -250,17 +269,17 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
        float scale5 = 1.0f;
        float scale7 = 1.0f;
-            ddsc3[0] = 0.0f;
            ddsc3[1] = 0.0f;
            ddsc3[2] = 0.0f;
+            ddsc3[3] = 0.0f;
-            ddsc5[0] = 0.0f;
            ddsc5[1] = 0.0f;
            ddsc5[2] = 0.0f;
+            ddsc5[3] = 0.0f;
-            ddsc7[0] = 0.0f;
            ddsc7[1] = 0.0f;
            ddsc7[2] = 0.0f;
+            ddsc7[3] = 0.0f;
        float pdk    = atomJ.damp;
        float ptk    = atomJ.thole;
@@ -280,9 +299,11 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
                   ddsc3[1] = temp3 * xr;
                   ddsc3[2] = temp3 * yr;
                   ddsc3[3] = temp3 * zr;
                   ddsc5[1] = temp5 * ddsc3[1];
                   ddsc5[2] = temp5 * ddsc3[2];
                   ddsc5[3] = temp5 * ddsc3[3];
                   ddsc7[1] = temp7 * ddsc5[1];
                   ddsc7[2] = temp7 * ddsc5[2];
                   ddsc7[3] = temp7 * ddsc5[3];
@@ -508,8 +529,8 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
                    + rr7*gli[3]*psc7);
        e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
        ei = ei - erli;
-        e = f * e;
+        e = conversionFactor * e;
-        ei = f * ei;
+        ei = conversionFactor * ei;
 //        em = em + e;
 //        ep = ep + ei;
@@ -679,12 +700,9 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
                                   +(glip[2]+glip[7])*scalingFactors[DScaleIndex]);
        float temp7 = 0.5f * rr7 * (gli[3]*scalingFactors[PScaleIndex]
                                   +glip[3]*scalingFactors[DScaleIndex]);
-        fridmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1]
+        fridmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1] + temp7*ddsc7[1];
-                         + temp7*ddsc7[1];
+        fridmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2] + temp7*ddsc7[2];
-        fridmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2]
+        fridmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3] + temp7*ddsc7[3];
-                         + temp7*ddsc7[2];
-        fridmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3]
-                         + temp7*ddsc7[3];
        // find some scaling terms for induced-induced force
@@ -833,84 +851,186 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
        // handle the case where scaling is used
        for( int j = 1; j <= 3; j++ ){
-           ftm2[j] = f * (ftm2[j]-(1.0f-scalingFactors[MScaleIndex])*ftm2r[j]);
+           ftm2[j]  = (ftm2[j]-(1.0f-scalingFactors[MScaleIndex])*ftm2r[j]);
-           ftm2i[j] = f * (ftm2i[j]-ftm2ri[j]);
+           ftm2i[j] = (ftm2i[j]-ftm2ri[j]);
-           ttm2[j] = f * (ttm2[j]-(1.0f-scalingFactors[MScaleIndex])*ttm2r[j]);
+           ttm2[j]  = (ttm2[j]-(1.0f-scalingFactors[MScaleIndex])*ttm2r[j]);
-           ttm2i[j] = f * (ttm2i[j]-ttm2ri[j]);
+           ttm2i[j] = (ttm2i[j]-ttm2ri[j]);
-           ttm3[j] = f * (ttm3[j]-(1.0f-scalingFactors[MScaleIndex])*ttm3r[j]);
+           ttm3[j]  = (ttm3[j]-(1.0f-scalingFactors[MScaleIndex])*ttm3r[j]);
-           ttm3i[j] = f * (ttm3i[j]-ttm3ri[j]);
+           ttm3i[j] = (ttm3i[j]-ttm3ri[j]);
        }
        // increment gradient due to force and torque on first site;
+        outputForce[0]           = -conversionFactor*(ftm2[1] + ftm2i[1]);
+        outputForce[1]           = -conversionFactor*(ftm2[2] + ftm2i[2]);
+        outputForce[2]           = -conversionFactor*(ftm2[3] + ftm2i[3]);
+        outputTorque[0][0]       = conversionFactor*(ttm2[1] + ttm2i[1]);
+        outputTorque[0][1]       = conversionFactor*(ttm2[2] + ttm2i[2]);
+        outputTorque[0][2]       = conversionFactor*(ttm2[3] + ttm2i[3]);
+        outputTorque[1][0]       = conversionFactor*(ttm3[1] + ttm3i[1]);
+        outputTorque[1][1]       = conversionFactor*(ttm3[2] + ttm3i[2]);
+        outputTorque[1][2]       = conversionFactor*(ttm3[3] + ttm3i[3]);
+        //outputTorque[1][2]    = conversionFactor*(ttm3_2 + ttm3i_2);
+#ifdef AMOEBA_DEBUG
+    int debugIndex               = 0;
+    float idTracker              = 1.0f;
 /*
-        dem(1,ii) = dem(1,ii) + ftm2[1];
+    debugArray[debugIndex].x = atomI.labFrameDipole[0];
-        dem(2,ii) = dem(2,ii) + ftm2[2];
+    debugArray[debugIndex].y = atomI.labFrameDipole[1];
-        dem(3,ii) = dem(3,ii) + ftm2[3];
+    debugArray[debugIndex].z = atomI.labFrameDipole[2];
-        dep(1,ii) = dep(1,ii) + ftm2i[1];
+    debugArray[debugIndex].w = r2;
-        dep(2,ii) = dep(2,ii) + ftm2i[2];
-        dep(3,ii) = dep(3,ii) + ftm2i[3];
+    debugIndex++;
-        call torque (i,ttm2,ttm2i,frcxi,frcyi,frczi);
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
-        // increment gradient due to force and torque on second site;
+    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
+    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
-        dem(1,kk) = dem(1,kk) - ftm2[1];
+    debugArray[debugIndex].w = cSim.alphaEwald;
-        dem(2,kk) = dem(2,kk) - ftm2[2];
-        dem(3,kk) = dem(3,kk) - ftm2[3];
+    debugIndex++;
-        dep(1,kk) = dep(1,kk) - ftm2i[1];
+    idTracker               += 1.0;
-        dep(2,kk) = dep(2,kk) - ftm2i[2];
+    debugArray[debugIndex].x = atomI.inducedDipole[0];
-        dep(3,kk) = dep(3,kk) - ftm2i[3];
+    debugArray[debugIndex].y = atomI.inducedDipole[1];
-        call torque (k,ttm3,ttm3i,frcxk,frcyk,frczk);
+    debugArray[debugIndex].z = atomI.inducedDipole[2];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomJ.inducedDipole[0];
+    debugArray[debugIndex].y = atomJ.inducedDipole[1];
+    debugArray[debugIndex].z = atomJ.inducedDipole[2];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomI.inducedDipoleP[0];
+    debugArray[debugIndex].y = atomI.inducedDipoleP[1];
+    debugArray[debugIndex].z = atomI.inducedDipoleP[2];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomJ.inducedDipoleP[0];
+    debugArray[debugIndex].y = atomJ.inducedDipoleP[1];
+    debugArray[debugIndex].z = atomJ.inducedDipoleP[2];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ftm2[1];
+    debugArray[debugIndex].y = conversionFactor*ftm2[2];
+    debugArray[debugIndex].z = conversionFactor*ftm2[3];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
 */
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ftm2i[1];
+    debugArray[debugIndex].y = conversionFactor*ftm2i[2];
+    debugArray[debugIndex].z = conversionFactor*ftm2i[3];
+    debugArray[debugIndex].w = r2;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*fridmp[1];
+    debugArray[debugIndex].y = conversionFactor*fridmp[2];
+    debugArray[debugIndex].z = conversionFactor*fridmp[3];
+    debugArray[debugIndex].w = cSim.alphaEwald;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*findmp[1];
+    debugArray[debugIndex].y = conversionFactor*findmp[2];
+    debugArray[debugIndex].z = conversionFactor*findmp[3];
+    debugArray[debugIndex].w = cSim.alphaEwald + 1.0f;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ttm2[1];
+    debugArray[debugIndex].y = conversionFactor*ttm2[2];
+    debugArray[debugIndex].z = conversionFactor*ttm2[3];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ttm2i[1];
+    debugArray[debugIndex].y = conversionFactor*ttm2i[2];
+    debugArray[debugIndex].z = conversionFactor*ttm2i[3];
+    debugArray[debugIndex].w = idTracker;
+#endif
+    } else {
+        outputForce[0]           = 0.0f;
+        outputForce[1]           = 0.0f;
+        outputForce[2]           = 0.0f;
+        outputTorque[0][0]       = 0.0f;
+        outputTorque[0][1]       = 0.0f;
+        outputTorque[0][2]       = 0.0f;
+        outputTorque[1][0]       = 0.0f;
+        outputTorque[1][1]       = 0.0f;
+        outputTorque[1][2]       = 0.0f;
+#ifdef AMOEBA_DEBUG
+for( int ii = 0; ii < 20; ii++ ){
+    debugArray[ii].x = 0.0f;
+    debugArray[ii].y = 0.0f;
+    debugArray[ii].z = 0.0f;
+    debugArray[ii].w = (float) ii;
+}
+#endif
    }
    return;
 }
-__device__ void loadRealSpaceEwaldShared( struct RealSpaceEwaldParticle* sA, unsigned int atomI,
+__device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticParticle* sA, unsigned int atomI )
-                                       float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
-                                       float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
 {
    // coordinates & charge
+    sA->x                        = cSim.pPosq[atomI].x;
-    sA->x                        = atomCoord[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
-    sA->y                        = atomCoord[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
-    sA->z                        = atomCoord[atomI].z;
+    sA->q                        = cSim.pPosq[atomI].w;
-    sA->q                        = atomCoord[atomI].w;
    // lab dipole
-    sA->labFrameDipole[0]         = labFrameDipoleJ[atomI*3];
+    sA->labFrameDipole[0]        = cAmoebaSim.pLabFrameDipole[atomI*3];
-    sA->labFrameDipole[1]         = labFrameDipoleJ[atomI*3+1];
+    sA->labFrameDipole[1]        = cAmoebaSim.pLabFrameDipole[atomI*3+1];
-    sA->labFrameDipole[2]         = labFrameDipoleJ[atomI*3+2];
+    sA->labFrameDipole[2]        = cAmoebaSim.pLabFrameDipole[atomI*3+2];
    // lab quadrupole
-    sA->labFrameQuadrupole[0]    = labQuadrupole[atomI*9];
+    sA->labFrameQuadrupole[0]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
-    sA->labFrameQuadrupole[1]    = labQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole[1]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
-    sA->labFrameQuadrupole[2]    = labQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole[2]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
-    sA->labFrameQuadrupole[3]    = labQuadrupole[atomI*9+3];
+    sA->labFrameQuadrupole[3]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
-    sA->labFrameQuadrupole[4]    = labQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole[4]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
-    sA->labFrameQuadrupole[5]    = labQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole[5]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
-    sA->labFrameQuadrupole[6]    = labQuadrupole[atomI*9+6];
+    sA->labFrameQuadrupole[6]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
-    sA->labFrameQuadrupole[7]    = labQuadrupole[atomI*9+7];
+    sA->labFrameQuadrupole[7]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
-    sA->labFrameQuadrupole[8]    = labQuadrupole[atomI*9+8];
+    sA->labFrameQuadrupole[8]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
    // induced dipole
-    sA->inducedDipole[0]          = inducedDipole[atomI*3];
+    sA->inducedDipole[0]         =  cAmoebaSim.pInducedDipole[atomI*3];
-    sA->inducedDipole[1]          = inducedDipole[atomI*3+1];
+    sA->inducedDipole[1]         =  cAmoebaSim.pInducedDipole[atomI*3+1];
-    sA->inducedDipole[2]          = inducedDipole[atomI*3+2];
+    sA->inducedDipole[2]         =  cAmoebaSim.pInducedDipole[atomI*3+2];
    // induced dipole polar
-    sA->inducedDipoleP[0]         = inducedDipolePolar[atomI*3];
+    sA->inducedDipoleP[0]        =  cAmoebaSim.pInducedDipolePolar[atomI*3];
-    sA->inducedDipoleP[1]         = inducedDipolePolar[atomI*3+1];
+    sA->inducedDipoleP[1]        =  cAmoebaSim.pInducedDipolePolar[atomI*3+1];
-    sA->inducedDipoleP[2]         = inducedDipolePolar[atomI*3+2];
+    sA->inducedDipoleP[2]        =  cAmoebaSim.pInducedDipolePolar[atomI*3+2];
-    sA->damp                     = dampingFactorAndThole[atomI].x;
+    sA->damp                     = cAmoebaSim.pDampingFactorAndThole[atomI].x;
-    sA->thole                    = dampingFactorAndThole[atomI].y;
+    sA->thole                    = cAmoebaSim.pDampingFactorAndThole[atomI].y;
 }
@@ -918,11 +1038,11 @@ __device__ void loadRealSpaceEwaldShared( struct RealSpaceEwaldParticle* sA, uns
 #undef USE_OUTPUT_BUFFER_PER_WARP
 #define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateAmoebaCudaRealSpaceEwald.h"
+#include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
 #define USE_OUTPUT_BUFFER_PER_WARP
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateAmoebaCudaRealSpaceEwald.h"
+#include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
 // reduce psWorkArray_3_1 -> force
 // reduce psWorkArray_3_2 -> torque
@@ -932,23 +1052,27 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                             amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
-    LAUNCHERROR("kReduceRealSpaceEwaldForce");
+    LAUNCHERROR("kReducePmeDirectElectrostaticForce");
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                             amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
-    LAUNCHERROR("kReduceRealSpaceEwaldTorque");
+    LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
 }
+//#define GET_INDUCED_DIPOLE_FROM_FILE
+#ifdef GET_INDUCED_DIPOLE_FROM_FILE
+#include <stdlib.h>
+#endif
 /**---------------------------------------------------------------------------------------
-   Compute Amoeba electrostatic force & torque
+   Compute Amoeba dirrect space portion of electrostatic force & torque
   @param amoebaGpu        amoebaGpu context
-   @param gpu              OpenMM gpu Cuda context
   --------------------------------------------------------------------------------------- */
-void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
+void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 {
   // ---------------------------------------------------------------------------------------
@@ -956,7 +1080,7 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
    static unsigned int threadsPerBlock = 0;
 #ifdef AMOEBA_DEBUG
-    static const char* methodName = "cudaComputeAmoebaRealSpaceEwald";
+    static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
    static int timestep = 0;
    std::vector<int> fileId;
    timestep++;
@@ -983,7 +1107,38 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
-    unsigned int targetAtom                   = 0;
+    unsigned int targetAtom                   = 10;
+#endif
+#ifdef GET_INDUCED_DIPOLE_FROM_FILE
+    std::string fileName = "waterInducedDipole.txt";
+    StringVectorVector fileContents;
+    readFile( fileName, fileContents );
+    unsigned int offset  = 0;
+    (void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
+    for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
+        StringVector lineTokens     = fileContents[ii];
+        unsigned int lineTokenIndex = 1;
+        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
+        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); 
+        offset                                              -= 3;        
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+    }
+    float conversion = 0.1f;
+    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
+        amoebaGpu->psInducedDipole->_pSysStream[0][ii]       *= conversion;
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][ii]  *= conversion;
+    }
+    amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
+    SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
+    amoebaGpu->psInducedDipole->Upload();
+    amoebaGpu->psInducedDipolePolar->Upload();
 #endif
    // on first pass, set threads/block
@@ -996,20 +1151,15 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
          maxThreads = 128;
      else
          maxThreads = 64;
-      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(RealSpaceEwaldParticle)), maxThreads);
+      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
    }
    kClearFields_3( amoebaGpu, 2 );
    if (gpu->bOutputBufferPerWarp){
-      kCalculateAmoebaCudaRealSpaceEwaldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                         gpu->psPosq4->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1021,22 +1171,14 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
    } else {
 #ifdef AMOEBA_DEBUG
-      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaRealSpaceEwaldN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                      sizeof(RealSpaceEwaldParticle), sizeof(RealSpaceEwaldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+                      sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
      (void) fflush( amoebaGpu->log );
 #endif
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
-      cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
-      kCalculateAmoebaCudaRealSpaceEwaldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                         gpu->psPosq4->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1045,108 +1187,69 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
 #endif
    }
-    LAUNCHERROR("kCalculateAmoebaCudaRealSpaceEwaldN2Forces");
+    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticN2Forces");
    kReduceForceTorque( amoebaGpu );
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-      amoebaGpu->psForce->Download();
+        amoebaGpu->psForce->Download();
-      amoebaGpu->psTorque->Download();
+        amoebaGpu->psTorque->Download();
-      debugArray->Download();
+        debugArray->Download();
-      (void) fprintf( amoebaGpu->log, "Finished RealSpaceEwald kernel execution\n" ); (void) fflush( amoebaGpu->log );
+        (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
-      int maxPrint        = 1400;
+        int maxPrint        = 1400;
-      for( int ii = 0; ii < gpu->natoms; ii++ ){
+        float conversion    = 1.0f/41.84f;
-         (void) fprintf( amoebaGpu->log, "%5d ", ii); 
+        for( int ii = 0; ii < gpu->natoms; ii++ ){
+            (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-          int indexOffset     = ii*3;
+            int indexOffset     = ii*3;
-         // force
+            // force
-         (void) fprintf( amoebaGpu->log,"RealSpaceEwaldF [%16.9e %16.9e %16.9e] ",
-                         amoebaGpu->psForce->_pSysStream[0][indexOffset],
+            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
-                         amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
+                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                         amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
+                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
-         // torque
+            // torque
-         (void) fprintf( amoebaGpu->log,"RealSpaceEwaldT [%16.9e %16.9e %16.9e] ",
-                         amoebaGpu->psTorque->_pSysStream[0][indexOffset],
+            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
-                         amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
+                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                         amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
+                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
-         // coords
+            (void) fprintf( amoebaGpu->log,"\n" );
-#if 0
+            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-          (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
+                ii = gpu->natoms - maxPrint;
-                          gpu->psPosq4->_pSysStream[0][ii].x,
+            }
-                          gpu->psPosq4->_pSysStream[0][ii].y,
+        }
-                          gpu->psPosq4->_pSysStream[0][ii].z);
+        if( 1 ){
+            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
+            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-         for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
+            for( int jj = 0; jj < gpu->natoms; jj++ ){
-             int debugIndex = jj*gpu->natoms + ii;
+                int debugIndex = jj;
-             float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
+                for( int kk = 0; kk < 15; kk++ ){
-             float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
+                    (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
-             float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
+                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-             (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
+                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
-                             ii, jj, xx, yy, zz,
+                    debugIndex += paddedNumberOfAtoms;
-                             debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                }
+                (void) fprintf( amoebaGpu->log,"\n" );
-         }
+            }
-#endif
+        }
-         (void) fprintf( amoebaGpu->log,"\n" );
+        (void) fflush( amoebaGpu->log );
-         if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-              ii = gpu->natoms - maxPrint;
+        if( 1 ){
+            std::vector<int> fileId;
+            //fileId.push_back( 0 );
+            VectorOfDoubleVectors outputVector;
+            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector);
+            cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
         }
-      }
-      if( 1 ){
-          (void) fprintf( amoebaGpu->log,"DebugElec\n" );
-          int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-          for( int jj = 0; jj < gpu->natoms; jj++ ){
-              int debugIndex = jj;
-              for( int kk = 0; kk < 5; kk++ ){
-                  (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
-                                  debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                  debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
-                  debugIndex += paddedNumberOfAtoms;
-              }
-              (void) fprintf( amoebaGpu->log,"\n" );
-          }
-      }
-      (void) fflush( amoebaGpu->log );
-      if( 0 ){
-          (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
-          int maxPrint = 12;
-          for( int ii = 0; ii < gpu->natoms; ii++ ){
-              // print cpu & gpu reductions
-              int offset  = 3*ii;
-              (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
-                              amoebaGpu->psForce->_pSysStream[0][offset],
-                              amoebaGpu->psForce->_pSysStream[0][offset+1],
-                              amoebaGpu->psForce->_pSysStream[0][offset+2],
-                              amoebaGpu->psTorque->_pSysStream[0][offset],
-                              amoebaGpu->psTorque->_pSysStream[0][offset+1],
-                              amoebaGpu->psTorque->_pSysStream[0][offset+2] );
-              if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
-          }   
-      }   
-      if( 1 ){
-          std::vector<int> fileId;
-          //fileId.push_back( 0 );
-          VectorOfDoubleVectors outputVector;
-          cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector );
-          cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector );
-          cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector);
-          cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
-       }
    }   
    delete debugArray;
@@ -1155,3 +1258,19 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
   // ---------------------------------------------------------------------------------------
 }
+/**---------------------------------------------------------------------------------------
+   Compute Amoeba electrostatic force & torque using PME
+   @param amoebaGpu        amoebaGpu context
+   --------------------------------------------------------------------------------------- */
+void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
+{
+    cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
+    kCalculateAmoebaPME( amoebaGpu );
+} 
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.h
@@ -34,15 +34,8 @@ __launch_bounds__(128, 1)
 #else
 __launch_bounds__(64, 1)
 #endif
-void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
+void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
-                            unsigned int* workUnit,
+                            unsigned int* workUnit, float* outputForce, float* outputTorque
-                            float4* atomCoord,
-                            float* labFrameDipole,
-                            float* labFrameQuadrupole,
-                            float* inducedDipole,
-                            float* inducedDipolePolar,
-                            float* outputForce,
-                            float* outputTorque
 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
@@ -50,10 +43,11 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
 ){
 #ifdef AMOEBA_DEBUG
+    int pullIndexMax = 12;
    float4 pullBack[20];
 #endif
-    extern __shared__ RealSpaceEwaldParticle sA[];
+    extern __shared__ PmeDirectElectrostaticParticle sA[];
    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
@@ -80,12 +74,10 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
        unsigned int tbx              = threadIdx.x - tgx;
        unsigned int tj               = tgx;
-        RealSpaceEwaldParticle* psA    = &sA[tbx];
+        PmeDirectElectrostaticParticle* psA    = &sA[tbx];
        unsigned int atomI            = x + tgx;
-        RealSpaceEwaldParticle localParticle;
+        PmeDirectElectrostaticParticle localParticle;
-        loadRealSpaceEwaldShared(&localParticle, atomI,
+        loadPmeDirectElectrostaticShared(&localParticle, atomI );
-                                atomCoord, labFrameDipole, labFrameQuadrupole,
-                                inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
        localParticle.force[0]                   = 0.0f;
        localParticle.force[1]                   = 0.0f;
@@ -105,9 +97,7 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
            // load shared data
-           loadRealSpaceEwaldShared( &(sA[threadIdx.x]), atomI,
+            loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );
-                                    atomCoord, labFrameDipole, labFrameQuadrupole,
-                                    inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
            if (!bExclusionFlag)
            {
@@ -122,9 +112,8 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
                    float torque[2][3];
                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[j],
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
-                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                                   scalingFactors, force, torque, &energy
-                                                          force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -142,7 +131,7 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
                    localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
                    localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
-                    totalEnergy            += mask ? 0.5*energy   : 0.0f;
+                    totalEnergy                       += mask ? 0.5*energy   : 0.0f;
                }
            }
@@ -171,9 +160,8 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
                    // force
                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[j],
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[j],
-                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                                   scalingFactors, force, torque, &energy
-                                                          force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -194,63 +182,59 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
                    localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
                    localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
-                    totalEnergy            += mask ? 0.5*energy   : 0.0f;
+                    totalEnergy                       += mask ? 0.5*energy   : 0.0f;
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom ){
-            unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+    float blockId                      = 1.0f;
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) atomJ;
+    debugArray[index].x                = (float) atomI;
-            debugArray[index].z                = 1.0f;
+    debugArray[index].y                = (float) atomJ;
-            debugArray[index].w                = (float) y;
+    debugArray[index].z                = (float) y;
-/*
+    debugArray[index].w                = blockId;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = mask ? force[0]     : 0.0f;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].y                = mask ? force[1]     : 0.0f;
+    debugArray[index].x                = mask ? force[0]     : 0.0f;
-            debugArray[index].z                = mask ? force[2]     : 0.0f;
+    debugArray[index].y                = mask ? force[1]     : 0.0f;
+    debugArray[index].z                = mask ? force[2]     : 0.0f;
+    debugArray[index].w                = blockId;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = mask ? torque[0][0] : 0.0f;
-            debugArray[index].y                = mask ? torque[0][1] : 0.0f;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].z                = mask ? torque[0][2] : 0.0f;
+    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
-*/
+    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
+    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].w                = blockId;
-            int pullIndex                      = 0;
-            debugArray[index].x                = pullBack[pullIndex].x;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].y                = pullBack[pullIndex].y;
+    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
-            debugArray[index].z                = pullBack[pullIndex].z;
+    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
-            debugArray[index].w                = pullBack[pullIndex].w;
+    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
+    debugArray[index].w                = blockId;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            pullIndex++;
+    for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
-            debugArray[index].x                = pullBack[pullIndex].x;
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].w                = pullBack[pullIndex].w;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+    }
-            pullIndex++;
-            debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].w                = pullBack[pullIndex].w;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            pullIndex++;
-            debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].w                = pullBack[pullIndex].w;
 }
 #endif
                }
            }
+            // include self energy and self torque
+            if( atomI < cAmoebaSim.numberOfAtoms ){
+                calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
+                //float energy;
+                //calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
+                //totalEnergy += energy;
+            }
            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
@@ -300,9 +284,7 @@ if( atomI == targetAtom ){
            {
                // load shared data
-               loadRealSpaceEwaldShared( &(sA[threadIdx.x]), (y+tgx),
+               loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
-                                        atomCoord, labFrameDipole, labFrameQuadrupole,
-                                        inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
            }
@@ -325,9 +307,8 @@ if( atomI == targetAtom ){
                    unsigned int atomJ = y + tj;
                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[tj],
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,  psA[tj],
-                                                                 cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                                   scalingFactors, force, torque, &energy
-                                                                 force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -345,75 +326,59 @@ if( atomI == targetAtom ){
                           localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
                           localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
-                           totalEnergy         += mask ? energy        : 0.0f;
+                           totalEnergy                    += mask ? energy        : 0.0f;
                           // add force and torque to atom J due atom I
-                           psA[tj].force[0]     -= mask ?  force[0]     : 0.0f;
+                           psA[tj].force[0]               -= mask ?  force[0]     : 0.0f;
-                           psA[tj].force[1]     -= mask ?  force[1]     : 0.0f;
+                           psA[tj].force[1]               -= mask ?  force[1]     : 0.0f;
-                           psA[tj].force[2]     -= mask ?  force[2]     : 0.0f;
+                           psA[tj].force[2]               -= mask ?  force[2]     : 0.0f;
-                           psA[tj].torque[0]    += mask ?  torque[1][0] : 0.0f;
+                           psA[tj].torque[0]              += mask ?  torque[1][0] : 0.0f;
-                           psA[tj].torque[1]    += mask ?  torque[1][1] : 0.0f;
+                           psA[tj].torque[1]              += mask ?  torque[1][1] : 0.0f;
-                           psA[tj].torque[2]    += mask ?  torque[1][2] : 0.0f;
+                           psA[tj].torque[2]              += mask ?  torque[1][2] : 0.0f;
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom  || atomJ == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
-        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-        unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-        float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) atomJ;
-        debugArray[index].z                = 2.0f;
-        debugArray[index].w                = (float) y;
-/*
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
-        debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
-        debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
-        debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
-        debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
-        debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
-        debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
-*/
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        int pullIndex                      = 0;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
+    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+    unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
+    unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
+    float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+    float blockId                      = 2.0f;
+    debugArray[index].x                = (float) atomI;
+    debugArray[index].y                = (float) atomJ;
+    debugArray[index].z                = (float) y;
+    debugArray[index].w                = blockId;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
+    debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
+    debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
+    debugArray[index].w                = blockId;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
+    debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
+    debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
+    debugArray[index].w                = blockId;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
+    debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
+    debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
+    debugArray[index].w                = blockId;
+    for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
        debugArray[index].w                = pullBack[pullIndex].w;
+    }
 }
 #endif
@@ -449,9 +414,8 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                           // force
                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[tj],
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[tj],
-                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                                   scalingFactors, force, torque, &energy
-                                                          force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -471,74 +435,58 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                    localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
                    localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
-                    totalEnergy         += mask ? energy        : 0.0f;
+                    totalEnergy                    += mask ? energy        : 0.0f;
                    // add force and torque to atom J due atom I
-                    psA[tj].force[0]     -= mask ?  force[0]     : 0.0f;
+                    psA[tj].force[0]               -= mask ?  force[0]     : 0.0f;
-                    psA[tj].force[1]     -= mask ?  force[1]     : 0.0f;
+                    psA[tj].force[1]               -= mask ?  force[1]     : 0.0f;
-                    psA[tj].force[2]     -= mask ?  force[2]     : 0.0f;
+                    psA[tj].force[2]               -= mask ?  force[2]     : 0.0f;
-                    psA[tj].torque[0]    += mask ?  torque[1][0] : 0.0f;
+                    psA[tj].torque[0]              += mask ?  torque[1][0] : 0.0f;
-                    psA[tj].torque[1]    += mask ?  torque[1][1] : 0.0f;
+                    psA[tj].torque[1]              += mask ?  torque[1][1] : 0.0f;
-                    psA[tj].torque[2]    += mask ?  torque[1][2] : 0.0f;
+                    psA[tj].torque[2]              += mask ?  torque[1][2] : 0.0f;
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom  || atomJ == targetAtom ){
-         unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+        unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
-         unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
+        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-         unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
+        unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-         float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+        float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+        float blockId                      = 3.0f;
-         debugArray[index].x                = (float) atomI;
-         debugArray[index].y                = (float) atomJ;
-         debugArray[index].z                = 3.0f;
-         debugArray[index].w                = (float) y;
-/*
-         index                             += cAmoebaSim.paddedNumberOfAtoms;
-         debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
-         debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
-         debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
-         index                             += cAmoebaSim.paddedNumberOfAtoms;
-         debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
-         debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
-         debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
-         index                             += cAmoebaSim.paddedNumberOfAtoms;
-         debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
-         debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
-         debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
-*/
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        int pullIndex                      = 0;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) atomJ;
+        debugArray[index].z                = (float) y;
+        debugArray[index].w                = blockId;
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
-        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
-        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
-        debugArray[index].w                = pullBack[pullIndex].w;
+        debugArray[index].w                = blockId;
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
+        debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
-        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
-        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
-        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = blockId;
-        debugArray[index].w                = pullBack[pullIndex].w;
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
+        debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
-        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
-        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
-        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = blockId;
-        debugArray[index].w                = pullBack[pullIndex].w;
+        for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullIndex].x;
+            debugArray[index].y                = pullBack[pullIndex].y;
+            debugArray[index].z                = pullBack[pullIndex].z;
+            debugArray[index].w                = pullBack[pullIndex].w;
+        }
 #if 0
            index                             += cAmoebaSim.paddedNumberOfAtoms;
@@ -550,13 +498,6 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            index                             += cAmoebaSim.paddedNumberOfAtoms;
            debugArray[index].x                = scalingFactors[MScaleIndex];
            debugArray[index].y                = mScaleVal;
-            for( int pIndex = 0; pIndex < 14; pIndex++ ){
-                index                             += cAmoebaSim.paddedNumberOfAtoms;
-                debugArray[index].x                = pullBack[pIndex].x;
-                debugArray[index].y                = pullBack[pIndex].y;
-                debugArray[index].z                = pullBack[pIndex].z;
-                debugArray[index].w                = pullBack[pIndex].w;
-            }
            index                             += cAmoebaSim.paddedNumberOfAtoms;
            debugArray[index].x                = labFrameDipole[3*atomI];

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -322,7 +322,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
   --------------------------------------------------------------------------------------- */
-void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
+static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 {
   // ---------------------------------------------------------------------------------------
@@ -487,3 +487,9 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
         }
 }
+void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
+{
+    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
+    kCalculateAmoebaPMEFixedMultipoleField( amoebaGpu );
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -616,6 +616,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        // matrix multiply
        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
+        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
        LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");  
@@ -654,7 +655,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
           gpu->natoms, amoebaGpu->psPolarizability->_pDevStream[0],
           amoebaGpu->psInducedDipole->_pDevStream[0], amoebaGpu->psInducedDipolePolar->_pDevStream[0],
           amoebaGpu->psE_Field->_pDevStream[0],       amoebaGpu->psE_FieldPolar->_pDevStream[0],
-           amoebaGpu->psWorkVector[0]->_pDevStream[0],     amoebaGpu->psWorkVector[1]->_pDevStream[0] );
+           amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0] );
        LAUNCHERROR("kSorUpdatePmeMutualInducedField");  
        // get total epsilon -- performing sums on gpu

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -390,7 +390,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
        cudaComputeAmoebaElectrostatic( amoebaGpu );
    } else {
-        cudaComputeAmoebaRealSpaceEwald( amoebaGpu );
+        cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
    }
    // map torques to forces

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -2008,6 +2008,7 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::NoCutoff );
    }
    multipoleForce->setCutoffDistance( cutoffDistance );
+    multipoleForce->setAEwald( aewald );
    system.setDefaultPeriodicBoxVectors( Vec3(box[0], 0.0, 0.0), Vec3(0.0, box[1], 0.0), Vec3(0.0, 0.0, box[2]) );
    if( log ){
        (void) fprintf( log, "%s number of MultipoleParameter terms=%d usePme=%d aewald=%15.7e cutoffDistance=%12.4f\n",
@@ -2111,6 +2112,7 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        double polarityConversion      = AngstromToNm*AngstromToNm*AngstromToNm;
        double dampingFactorConversion = sqrt( AngstromToNm );
+        multipoleForce->setAEwald(                multipoleForce->getAEwald()/AngstromToNm );
        multipoleForce->setCutoffDistance(        multipoleForce->getCutoffDistance()*AngstromToNm );
        multipoleForce->setScalingDistanceCutoff( multipoleForce->getScalingDistanceCutoff()*AngstromToNm );
@@ -2165,8 +2167,8 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
                        (useOpenMMUnits ? "OpenMM" : "Amoeba") );
        std::string nonbondedMethod = multipoleForce->getNonbondedMethod( ) == AmoebaMultipoleForce::PME ? "PME" : "NoCutoff";
-        (void) fprintf( log, "NonbondedMethod=%s cutoff=%15.7e.\n", nonbondedMethod.c_str(),
+        (void) fprintf( log, "NonbondedMethod=%s aEwald=%15.7e cutoff=%15.7e.\n", nonbondedMethod.c_str(),
-                         multipoleForce->getCutoffDistance() );
+                        multipoleForce->getAEwald(), multipoleForce->getCutoffDistance() );
        Vec3 a,b,c;
        system.getDefaultPeriodicBoxVectors( a, b, c );