Direct space PME

2ef05285 · Mark Friedrichs · c22f00cf · 2ef05285 · 2ef05285 · 2ef05285
Commit 2ef05285 authored Sep 13, 2010 by Mark Friedrichs
13 changed files
--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -38,8 +38,7 @@ ENDIF(LOG)

 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-#SET(OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS . openmmapi platforms/reference)
-SET(OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS . openmmapi )
+SET(OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS . openmmapi platforms/reference)

 # Collect up information about the version of the OpenMM library we're building
 # and make it available to the code so it can be built into the binaries.
@@ -103,8 +102,10 @@ FOREACH(subdir ${OPENMM_AMOEBA_PLUGIN_SOURCE_SUBDIRS})
    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
 ENDFOREACH(subdir)

-#INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src)
-#INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src/SimTKReference)
+INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src)
+INCLUDE_DIRECTORIES(BEFORE ${OPENMM_DIR}/platforms/reference/src/SimTKReference)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/platforms/reference/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/platforms/reference/src/SimTKReference)

 # ----------------------------------------------------------------------------
 IF(LOG)
@@ -158,7 +159,7 @@ IF(OPENMM_BUILD_STATIC_LIB)
  TARGET_LINK_LIBRARIES( ${STATIC_AMOEBA_TARGET} ${STATIC_TARGET} )
 ENDIF(OPENMM_BUILD_STATIC_LIB)

-#ADD_SUBDIRECTORY(platforms/reference/tests)
+ADD_SUBDIRECTORY(platforms/reference/tests)

 # Which hardware platforms to build
 IF(CUDA_FOUND)

--- a/plugins/amoeba/openmmapi/include/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/AmoebaMultipoleForce.h
@@ -113,6 +113,20 @@ public:
     */
    void setCutoffDistance(double distance);

+   /**		   
+     * Get the aEwald parameter		 
+     *		 
+     * @return the Ewald parameter		 
+     */		 
+    double getAEwald() const;			 
+		 
+    /**		 
+     * Set the aEwald parameter		 
+     *		 
+     * @param Ewald parameter			 
+     */		 
+    void setAEwald(double aewald);		 
+		 
    /**
     * Add multipole-related info for a particle 
     *
@@ -279,6 +293,7 @@ private:

    AmoebaNonbondedMethod nonbondedMethod;
    double cutoffDistance;
+    double aewald;
    MutualInducedIterationMethod mutualInducedIterationMethod;
    int mutualInducedMaxIterations;
    double mutualInducedTargetEpsilon;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -56,6 +56,14 @@ void AmoebaMultipoleForce::setCutoffDistance(double distance) {
    cutoffDistance = distance;
 }

+double AmoebaMultipoleForce::getAEwald() const {		 
+    return aewald;		 
+}		 
+		 
+void AmoebaMultipoleForce::setAEwald(double inputAewald ) {			 
+    aewald = inputAewald;		 
+}		 
+ 
 AmoebaMultipoleForce::MutualInducedIterationMethod AmoebaMultipoleForce::getMutualInducedIterationMethod( void ) const {
    return mutualInducedIterationMethod;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -589,6 +589,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
                                    static_cast<float>( force.getMutualInducedTargetEpsilon()),
                                    nonbondedMethod,
                                    static_cast<float>( force.getCutoffDistance()),
+                                    static_cast<float>( force.getAEwald()),
                                    static_cast<float>( force.getElectricConstant()) );
    if (nonbondedMethod == AmoebaMultipoleForce::PME) {
        double alpha;

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -350,6 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pP_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pP_ScaleIndices );
    (void) fprintf( log, "     pM_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pM_ScaleIndices );
    (void) fprintf( log, "     sqrtPi                             %15.7e\n",  amoebaGpu->amoebaSim.sqrtPi );
+    (void) fprintf( log, "     alpha Ewald                        %15.7e\n",  gpu->sim.alphaEwald );
    (void) fprintf( log, "     cutoffDistance2                    %15.7e\n",  amoebaGpu->amoebaSim.cutoffDistance2 );
    (void) fprintf( log, "     electric                           %15.7e\n",  amoebaGpu->amoebaSim.electric );
    (void) fprintf( log, "     box                                %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
@@ -1463,7 +1464,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector< std::vector< std::vector<int> > >& multipoleParticleCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange, 
                                     int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, float cutoffDistance, float electricConstant ){
+                                     int nonbondedMethod, float cutoffDistance, float alphaEwald, float electricConstant ){

 // ---------------------------------------------------------------------------------------

@@ -1565,9 +1566,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    amoebaGpu->amoebaSim.cutoffDistance2             = cutoffDistance*cutoffDistance;
    amoebaGpu->amoebaSim.sqrtPi                      = sqrt( 3.1415926535897932384626433832795 );
    amoebaGpu->amoebaSim.electric                    = electricConstant;
-
+    amoebaGpu->gpuContext->sim.alphaEwald            = alphaEwald;
    amoebaGpu->gpuContext->sim.nonbondedCutoff       = cutoffDistance;
-    tabulateErfc(amoebaGpu->gpuContext); 

    if( amoebaGpu->amoebaSim.dielec < 1.0e-05 ){
        amoebaGpu->amoebaSim.dielec               = 1.0f;
@@ -2766,7 +2766,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
    SetCalculateAmoebaCudaPmeMutualInducedFieldSim( amoebaGpu );
    SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu );
    SetCalculateAmoebaElectrostaticSim( amoebaGpu );
-    SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );
+    SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpu );
    SetCalculateAmoebaCudaMapTorquesSim( amoebaGpu );
    SetCalculateAmoebaKirkwoodSim( amoebaGpu );
    SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -106,9 +106,9 @@ extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu );

-extern void SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
-extern void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu );
+extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
+extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
+extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu );

 extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -307,7 +307,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
                                     int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, float cutoffDistance, float electricConstant );
+                                     int nonbondedMethod, float cutoffDistance,  float alphaEwald, float electricConstant );


 extern "C"

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
@@ -11,34 +11,24 @@
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

-void SetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
+void SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
 {
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
-    RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+    RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
 }

-void GetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
+void GetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
 {
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
-    RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
-
-static __device__ float fastErfc(float r)
-{
-    float normalized = cSim.tabulatedErfcScale*r;
-    int index = (int) normalized;
-    float fract2 = normalized-index;
-    float fract1 = 1.0f-fract2;
-    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
+    RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

 static int const PScaleIndex            =  0; 
@@ -60,7 +50,7 @@ static int const MScaleIndex            =  3;
 //static int const Ddsc72Index            = 16;
 static int const LastScalingIndex       = 4;

-struct RealSpaceEwaldParticle {
+struct PmeDirectElectrostaticParticle {

    // coordinates charge

@@ -97,9 +87,50 @@ struct RealSpaceEwaldParticle {

 };

-__device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& atomI,   RealSpaceEwaldParticle& atomJ,
-                                                       float scalingDistanceCutoff,   float* scalingFactors,
-                                                       float*  outputForce,           float  outputTorque[2][3],
+// self-energy for PME
+
+__device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, float* energy)
+{
+    float term   = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
+    float fterm  = -(cAmoebaSim.electric/cAmoebaSim.dielec)*cSim.alphaEwald/cAmoebaSim.sqrtPi;
+
+    float cii    = atomI.q*atomI.q;
+
+    float dii    = atomI.labFrameDipole[0]*atomI.labFrameDipole[0] +
+                   atomI.labFrameDipole[1]*atomI.labFrameDipole[1] +
+                   atomI.labFrameDipole[2]*atomI.labFrameDipole[2];
+
+    float qii    = atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[0] +
+                   atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4] +
+                   atomI.labFrameQuadrupole[8]*atomI.labFrameQuadrupole[8] + 2.0f*(
+                   atomI.labFrameQuadrupole[1]*atomI.labFrameQuadrupole[1] +
+                   atomI.labFrameQuadrupole[2]*atomI.labFrameQuadrupole[2] +
+                   atomI.labFrameQuadrupole[5]*atomI.labFrameQuadrupole[5]);
+
+    float uii    = atomI.labFrameDipole[0]*atomI.inducedDipole[0] + atomI.labFrameDipole[1]*atomI.inducedDipole[1] + atomI.labFrameDipole[2]*atomI.inducedDipole[2];
+
+    *energy      = (cii + term*(dii/3.0f + 2.0f*term*qii/5.0f));
+    *energy     += term*uii/3.0f;
+    *energy     *= fterm;
+}
+
+// self-torque for PME
+
+__device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI)
+{
+    float term       = (4.0f/3.0f)*(cAmoebaSim.electric/cAmoebaSim.dielec)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
+
+    float uix        = 0.5f*(atomI.inducedDipole[0] + atomI.inducedDipoleP[0]);
+    float uiy        = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
+    float uiz        = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
+
+    atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
+    atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
+    atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
+}
+
+__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI,   PmeDirectElectrostaticParticle& atomJ,
+                                                               float* scalingFactors, float*  outputForce, float  outputTorque[2][3],
                                                               float* energy
 #ifdef AMOEBA_DEBUG
                                                               ,float4* debugArray 
@@ -107,20 +138,8 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
 ){
  

-    float e,ei,f;
-    //float gfd,gfdr;
-    //float xix,yix,zix;
-    //float xiy,yiy,ziy;
-    //float xiz,yiz,ziz;
-    //float xkx,ykx,zkx;
-    //float xky,yky,zky;
-    //float xkz,ykz,zkz;
-    //float rr1,rr3;
-    //float rr5,rr7,rr9,rr11;
+    float e,ei;
    float erl,erli;
-    //float frcxi[4],frcxk[4];
-    //float frcyi[4],frcyk[4];
-    //float frczi[4],frczk[4];
    float di[4],qi[10];
    float dk[4],qk[10];
    float fridmp[4],findmp[4];
@@ -130,7 +149,6 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
    float ttm2i[4],ttm3i[4];
    float ttm2r[4],ttm3r[4];
    float ttm2ri[4],ttm3ri[4];
-    //float fdir[4]
    float dixdk[4];
    float dkxui[4],dixuk[4];
    float dixukp[4],dkxuip[4];
@@ -159,7 +177,7 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
    float gfr[8],gfri[7];
    float gti[7],gtri[7];

-    f = (cAmoebaSim.electric/cAmoebaSim.dielec);
+    float conversionFactor   = (cAmoebaSim.electric/cAmoebaSim.dielec);

    // set the permanent multipole and induced dipole values;

@@ -193,6 +211,7 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a

    float r2    = xr*xr + yr*yr + zr*zr;
    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+
        float r      = sqrt(r2);
        float ck     = atomJ.q;
      
@@ -214,7 +233,7 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a

        float ralpha = cSim.alphaEwald*r;

-               bn[0] = fastErfc(ralpha)/r;
+               bn[0] = erfc(ralpha)/r;

        float alsq2  = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n = 0.0f;
@@ -250,17 +269,17 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
        float scale5 = 1.0f;
        float scale7 = 1.0f;

-            ddsc3[0] = 0.0f;
            ddsc3[1] = 0.0f;
            ddsc3[2] = 0.0f;
+            ddsc3[3] = 0.0f;

-            ddsc5[0] = 0.0f;
            ddsc5[1] = 0.0f;
            ddsc5[2] = 0.0f;
+            ddsc5[3] = 0.0f;

-            ddsc7[0] = 0.0f;
            ddsc7[1] = 0.0f;
            ddsc7[2] = 0.0f;
+            ddsc7[3] = 0.0f;

        float pdk    = atomJ.damp;
        float ptk    = atomJ.thole;
@@ -280,9 +299,11 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
                   ddsc3[1] = temp3 * xr;
                   ddsc3[2] = temp3 * yr;
                   ddsc3[3] = temp3 * zr;
+
                   ddsc5[1] = temp5 * ddsc3[1];
                   ddsc5[2] = temp5 * ddsc3[2];
                   ddsc5[3] = temp5 * ddsc3[3];
+
                   ddsc7[1] = temp7 * ddsc5[1];
                   ddsc7[2] = temp7 * ddsc5[2];
                   ddsc7[3] = temp7 * ddsc5[3];
@@ -508,8 +529,8 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
                    + rr7*gli[3]*psc7);
        e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
        ei = ei - erli;
-        e = f * e;
-        ei = f * ei;
+        e = conversionFactor * e;
+        ei = conversionFactor * ei;
 //        em = em + e;
 //        ep = ep + ei;

@@ -679,12 +700,9 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
                                   +(glip[2]+glip[7])*scalingFactors[DScaleIndex]);
        float temp7 = 0.5f * rr7 * (gli[3]*scalingFactors[PScaleIndex]
                                   +glip[3]*scalingFactors[DScaleIndex]);
-        fridmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1]
-                         + temp7*ddsc7[1];
-        fridmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2]
-                         + temp7*ddsc7[2];
-        fridmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3]
-                         + temp7*ddsc7[3];
+        fridmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1] + temp7*ddsc7[1];
+        fridmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2] + temp7*ddsc7[2];
+        fridmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3] + temp7*ddsc7[3];

        // find some scaling terms for induced-induced force

@@ -833,34 +851,138 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a
        // handle the case where scaling is used

        for( int j = 1; j <= 3; j++ ){
-           ftm2[j] = f * (ftm2[j]-(1.0f-scalingFactors[MScaleIndex])*ftm2r[j]);
-           ftm2i[j] = f * (ftm2i[j]-ftm2ri[j]);
-           ttm2[j] = f * (ttm2[j]-(1.0f-scalingFactors[MScaleIndex])*ttm2r[j]);
-           ttm2i[j] = f * (ttm2i[j]-ttm2ri[j]);
-           ttm3[j] = f * (ttm3[j]-(1.0f-scalingFactors[MScaleIndex])*ttm3r[j]);
-           ttm3i[j] = f * (ttm3i[j]-ttm3ri[j]);
+           ftm2[j]  = (ftm2[j]-(1.0f-scalingFactors[MScaleIndex])*ftm2r[j]);
+           ftm2i[j] = (ftm2i[j]-ftm2ri[j]);
+           ttm2[j]  = (ttm2[j]-(1.0f-scalingFactors[MScaleIndex])*ttm2r[j]);
+           ttm2i[j] = (ttm2i[j]-ttm2ri[j]);
+           ttm3[j]  = (ttm3[j]-(1.0f-scalingFactors[MScaleIndex])*ttm3r[j]);
+           ttm3i[j] = (ttm3i[j]-ttm3ri[j]);
        }

        // increment gradient due to force and torque on first site;
+
+        outputForce[0]           = -conversionFactor*(ftm2[1] + ftm2i[1]);
+        outputForce[1]           = -conversionFactor*(ftm2[2] + ftm2i[2]);
+        outputForce[2]           = -conversionFactor*(ftm2[3] + ftm2i[3]);
+        
+        outputTorque[0][0]       = conversionFactor*(ttm2[1] + ttm2i[1]);
+        outputTorque[0][1]       = conversionFactor*(ttm2[2] + ttm2i[2]);
+        outputTorque[0][2]       = conversionFactor*(ttm2[3] + ttm2i[3]);
+    
+        outputTorque[1][0]       = conversionFactor*(ttm3[1] + ttm3i[1]);
+        outputTorque[1][1]       = conversionFactor*(ttm3[2] + ttm3i[2]);
+        outputTorque[1][2]       = conversionFactor*(ttm3[3] + ttm3i[3]);
+        //outputTorque[1][2]    = conversionFactor*(ttm3_2 + ttm3i_2);
+
+#ifdef AMOEBA_DEBUG
+    int debugIndex               = 0;
+    float idTracker              = 1.0f;
 /*
-        dem(1,ii) = dem(1,ii) + ftm2[1];
-        dem(2,ii) = dem(2,ii) + ftm2[2];
-        dem(3,ii) = dem(3,ii) + ftm2[3];
-        dep(1,ii) = dep(1,ii) + ftm2i[1];
-        dep(2,ii) = dep(2,ii) + ftm2i[2];
-        dep(3,ii) = dep(3,ii) + ftm2i[3];
-        call torque (i,ttm2,ttm2i,frcxi,frcyi,frczi);
-
-        // increment gradient due to force and torque on second site;
-
-        dem(1,kk) = dem(1,kk) - ftm2[1];
-        dem(2,kk) = dem(2,kk) - ftm2[2];
-        dem(3,kk) = dem(3,kk) - ftm2[3];
-        dep(1,kk) = dep(1,kk) - ftm2i[1];
-        dep(2,kk) = dep(2,kk) - ftm2i[2];
-        dep(3,kk) = dep(3,kk) - ftm2i[3];
-        call torque (k,ttm3,ttm3i,frcxk,frcyk,frczk);
+    debugArray[debugIndex].x = atomI.labFrameDipole[0];
+    debugArray[debugIndex].y = atomI.labFrameDipole[1];
+    debugArray[debugIndex].z = atomI.labFrameDipole[2];
+    debugArray[debugIndex].w = r2;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
+    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
+    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
+    debugArray[debugIndex].w = cSim.alphaEwald;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomI.inducedDipole[0];
+    debugArray[debugIndex].y = atomI.inducedDipole[1];
+    debugArray[debugIndex].z = atomI.inducedDipole[2];
+    debugArray[debugIndex].w = idTracker;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomJ.inducedDipole[0];
+    debugArray[debugIndex].y = atomJ.inducedDipole[1];
+    debugArray[debugIndex].z = atomJ.inducedDipole[2];
+    debugArray[debugIndex].w = idTracker;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomI.inducedDipoleP[0];
+    debugArray[debugIndex].y = atomI.inducedDipoleP[1];
+    debugArray[debugIndex].z = atomI.inducedDipoleP[2];
+    debugArray[debugIndex].w = idTracker;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = atomJ.inducedDipoleP[0];
+    debugArray[debugIndex].y = atomJ.inducedDipoleP[1];
+    debugArray[debugIndex].z = atomJ.inducedDipoleP[2];
+    debugArray[debugIndex].w = idTracker;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ftm2[1];
+    debugArray[debugIndex].y = conversionFactor*ftm2[2];
+    debugArray[debugIndex].z = conversionFactor*ftm2[3];
+    debugArray[debugIndex].w = idTracker;
+    debugIndex++;
 */
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ftm2i[1];
+    debugArray[debugIndex].y = conversionFactor*ftm2i[2];
+    debugArray[debugIndex].z = conversionFactor*ftm2i[3];
+    debugArray[debugIndex].w = r2;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*fridmp[1];
+    debugArray[debugIndex].y = conversionFactor*fridmp[2];
+    debugArray[debugIndex].z = conversionFactor*fridmp[3];
+    debugArray[debugIndex].w = cSim.alphaEwald;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*findmp[1];
+    debugArray[debugIndex].y = conversionFactor*findmp[2];
+    debugArray[debugIndex].z = conversionFactor*findmp[3];
+    debugArray[debugIndex].w = cSim.alphaEwald + 1.0f;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ttm2[1];
+    debugArray[debugIndex].y = conversionFactor*ttm2[2];
+    debugArray[debugIndex].z = conversionFactor*ttm2[3];
+    debugArray[debugIndex].w = idTracker;
+
+    debugIndex++;
+    idTracker               += 1.0;
+    debugArray[debugIndex].x = conversionFactor*ttm2i[1];
+    debugArray[debugIndex].y = conversionFactor*ttm2i[2];
+    debugArray[debugIndex].z = conversionFactor*ttm2i[3];
+    debugArray[debugIndex].w = idTracker;
+#endif
+
+    } else {
+
+        outputForce[0]           = 0.0f;
+        outputForce[1]           = 0.0f;
+        outputForce[2]           = 0.0f;
+        
+        outputTorque[0][0]       = 0.0f;
+        outputTorque[0][1]       = 0.0f;
+        outputTorque[0][2]       = 0.0f;
+    
+        outputTorque[1][0]       = 0.0f;
+        outputTorque[1][1]       = 0.0f;
+        outputTorque[1][2]       = 0.0f;
+
+#ifdef AMOEBA_DEBUG
+for( int ii = 0; ii < 20; ii++ ){
+    debugArray[ii].x = 0.0f;
+    debugArray[ii].y = 0.0f;
+    debugArray[ii].z = 0.0f;
+    debugArray[ii].w = (float) ii;
+}
+#endif
   
    }

@@ -868,49 +990,47 @@ __device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& a

 }

-__device__ void loadRealSpaceEwaldShared( struct RealSpaceEwaldParticle* sA, unsigned int atomI,
-                                       float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
-                                       float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
+__device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticParticle* sA, unsigned int atomI )
 {
    // coordinates & charge
-
-    sA->x                        = atomCoord[atomI].x;
-    sA->y                        = atomCoord[atomI].y;
-    sA->z                        = atomCoord[atomI].z;
-    sA->q                        = atomCoord[atomI].w;
+    sA->x                        = cSim.pPosq[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
+    sA->q                        = cSim.pPosq[atomI].w;

    // lab dipole

-    sA->labFrameDipole[0]         = labFrameDipoleJ[atomI*3];
-    sA->labFrameDipole[1]         = labFrameDipoleJ[atomI*3+1];
-    sA->labFrameDipole[2]         = labFrameDipoleJ[atomI*3+2];
+    sA->labFrameDipole[0]        = cAmoebaSim.pLabFrameDipole[atomI*3];
+    sA->labFrameDipole[1]        = cAmoebaSim.pLabFrameDipole[atomI*3+1];
+    sA->labFrameDipole[2]        = cAmoebaSim.pLabFrameDipole[atomI*3+2];
+

    // lab quadrupole

-    sA->labFrameQuadrupole[0]    = labQuadrupole[atomI*9];
-    sA->labFrameQuadrupole[1]    = labQuadrupole[atomI*9+1];
-    sA->labFrameQuadrupole[2]    = labQuadrupole[atomI*9+2];
-    sA->labFrameQuadrupole[3]    = labQuadrupole[atomI*9+3];
-    sA->labFrameQuadrupole[4]    = labQuadrupole[atomI*9+4];
-    sA->labFrameQuadrupole[5]    = labQuadrupole[atomI*9+5];
-    sA->labFrameQuadrupole[6]    = labQuadrupole[atomI*9+6];
-    sA->labFrameQuadrupole[7]    = labQuadrupole[atomI*9+7];
-    sA->labFrameQuadrupole[8]    = labQuadrupole[atomI*9+8];
+    sA->labFrameQuadrupole[0]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
+    sA->labFrameQuadrupole[1]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole[2]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole[3]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
+    sA->labFrameQuadrupole[4]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole[5]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole[6]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
+    sA->labFrameQuadrupole[7]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
+    sA->labFrameQuadrupole[8]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];

    // induced dipole

-    sA->inducedDipole[0]          = inducedDipole[atomI*3];
-    sA->inducedDipole[1]          = inducedDipole[atomI*3+1];
-    sA->inducedDipole[2]          = inducedDipole[atomI*3+2];
+    sA->inducedDipole[0]         =  cAmoebaSim.pInducedDipole[atomI*3];
+    sA->inducedDipole[1]         =  cAmoebaSim.pInducedDipole[atomI*3+1];
+    sA->inducedDipole[2]         =  cAmoebaSim.pInducedDipole[atomI*3+2];

    // induced dipole polar

-    sA->inducedDipoleP[0]         = inducedDipolePolar[atomI*3];
-    sA->inducedDipoleP[1]         = inducedDipolePolar[atomI*3+1];
-    sA->inducedDipoleP[2]         = inducedDipolePolar[atomI*3+2];
+    sA->inducedDipoleP[0]        =  cAmoebaSim.pInducedDipolePolar[atomI*3];
+    sA->inducedDipoleP[1]        =  cAmoebaSim.pInducedDipolePolar[atomI*3+1];
+    sA->inducedDipoleP[2]        =  cAmoebaSim.pInducedDipolePolar[atomI*3+2];

-    sA->damp                     = dampingFactorAndThole[atomI].x;
-    sA->thole                    = dampingFactorAndThole[atomI].y;
+    sA->damp                     = cAmoebaSim.pDampingFactorAndThole[atomI].x;
+    sA->thole                    = cAmoebaSim.pDampingFactorAndThole[atomI].y;

 }

@@ -918,11 +1038,11 @@ __device__ void loadRealSpaceEwaldShared( struct RealSpaceEwaldParticle* sA, uns

 #undef USE_OUTPUT_BUFFER_PER_WARP
 #define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateAmoebaCudaRealSpaceEwald.h"
+#include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
 #define USE_OUTPUT_BUFFER_PER_WARP
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateAmoebaCudaRealSpaceEwald.h"
+#include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"

 // reduce psWorkArray_3_1 -> force
 // reduce psWorkArray_3_2 -> torque
@@ -932,23 +1052,27 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                             amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
-    LAUNCHERROR("kReduceRealSpaceEwaldForce");
+    LAUNCHERROR("kReducePmeDirectElectrostaticForce");
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                             amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
-    LAUNCHERROR("kReduceRealSpaceEwaldTorque");
+    LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
 }

+//#define GET_INDUCED_DIPOLE_FROM_FILE
+#ifdef GET_INDUCED_DIPOLE_FROM_FILE
+#include <stdlib.h>
+#endif
+
 /**---------------------------------------------------------------------------------------

-   Compute Amoeba electrostatic force & torque
+   Compute Amoeba dirrect space portion of electrostatic force & torque

   @param amoebaGpu        amoebaGpu context
-   @param gpu              OpenMM gpu Cuda context

   --------------------------------------------------------------------------------------- */

-void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
+void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 {
  
   // ---------------------------------------------------------------------------------------
@@ -956,7 +1080,7 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
    static unsigned int threadsPerBlock = 0;

 #ifdef AMOEBA_DEBUG
-    static const char* methodName = "cudaComputeAmoebaRealSpaceEwald";
+    static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
    static int timestep = 0;
    std::vector<int> fileId;
    timestep++;
@@ -983,7 +1107,38 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
-    unsigned int targetAtom                   = 0;
+    unsigned int targetAtom                   = 10;
+#endif
+
+#ifdef GET_INDUCED_DIPOLE_FROM_FILE
+    std::string fileName = "waterInducedDipole.txt";
+    StringVectorVector fileContents;
+    readFile( fileName, fileContents );
+    unsigned int offset  = 0;
+    (void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
+    for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
+
+        StringVector lineTokens     = fileContents[ii];
+        unsigned int lineTokenIndex = 1;
+
+        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
+        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); 
+        offset                                              -= 3;        
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+    }
+    float conversion = 0.1f;
+    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
+        amoebaGpu->psInducedDipole->_pSysStream[0][ii]       *= conversion;
+        amoebaGpu->psInducedDipolePolar->_pSysStream[0][ii]  *= conversion;
+    }
+    amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
+    SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
+    amoebaGpu->psInducedDipole->Upload();
+    amoebaGpu->psInducedDipolePolar->Upload();
 #endif

    // on first pass, set threads/block
@@ -996,20 +1151,15 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
          maxThreads = 128;
      else
          maxThreads = 64;
-      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(RealSpaceEwaldParticle)), maxThreads);
+      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
    }

    kClearFields_3( amoebaGpu, 2 );

    if (gpu->bOutputBufferPerWarp){

-      kCalculateAmoebaCudaRealSpaceEwaldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                         gpu->psPosq4->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1021,22 +1171,14 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
    } else {

 #ifdef AMOEBA_DEBUG
-      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaRealSpaceEwaldN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                      sizeof(RealSpaceEwaldParticle), sizeof(RealSpaceEwaldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+                      sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
      (void) fflush( amoebaGpu->log );
 #endif

-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
-
-      kCalculateAmoebaCudaRealSpaceEwaldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                         gpu->psPosq4->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                         amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                         amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1045,7 +1187,7 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
 #endif
    }
-    LAUNCHERROR("kCalculateAmoebaCudaRealSpaceEwaldN2Forces");
+    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticN2Forces");
    kReduceForceTorque( amoebaGpu );

 #ifdef AMOEBA_DEBUG
@@ -1055,9 +1197,10 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
        amoebaGpu->psTorque->Download();
        debugArray->Download();
  
-      (void) fprintf( amoebaGpu->log, "Finished RealSpaceEwald kernel execution\n" ); (void) fflush( amoebaGpu->log );
+        (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
  
        int maxPrint        = 1400;
+        float conversion    = 1.0f/41.84f;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "%5d ", ii); 
  
@@ -1065,38 +1208,18 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
      
            // force
  
-         (void) fprintf( amoebaGpu->log,"RealSpaceEwaldF [%16.9e %16.9e %16.9e] ",
-                         amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                         amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
-                         amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
+                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset],
+                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
+                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
      
            // torque
  
-         (void) fprintf( amoebaGpu->log,"RealSpaceEwaldT [%16.9e %16.9e %16.9e] ",
-                         amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                         amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                         amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
-
-         // coords
-
-#if 0
-          (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
-                          gpu->psPosq4->_pSysStream[0][ii].x,
-                          gpu->psPosq4->_pSysStream[0][ii].y,
-                          gpu->psPosq4->_pSysStream[0][ii].z);
-
-
-         for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
-             int debugIndex = jj*gpu->natoms + ii;
-             float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
-             float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
-             float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
-             (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
-                             ii, jj, xx, yy, zz,
-                             debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
+                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset],
+                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
+                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
  
-         }
-#endif
            (void) fprintf( amoebaGpu->log,"\n" );
            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
@@ -1107,7 +1230,7 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
            for( int jj = 0; jj < gpu->natoms; jj++ ){
                int debugIndex = jj;
-              for( int kk = 0; kk < 5; kk++ ){
+                for( int kk = 0; kk < 15; kk++ ){
                    (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
@@ -1118,26 +1241,6 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
        }
        (void) fflush( amoebaGpu->log );
  
-      if( 0 ){
-          (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
-          int maxPrint = 12;
-          for( int ii = 0; ii < gpu->natoms; ii++ ){
-    
-              // print cpu & gpu reductions
-    
-              int offset  = 3*ii;
-    
-              (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
-                              amoebaGpu->psForce->_pSysStream[0][offset],
-                              amoebaGpu->psForce->_pSysStream[0][offset+1],
-                              amoebaGpu->psForce->_pSysStream[0][offset+2],
-                              amoebaGpu->psTorque->_pSysStream[0][offset],
-                              amoebaGpu->psTorque->_pSysStream[0][offset+1],
-                              amoebaGpu->psTorque->_pSysStream[0][offset+2] );
-              if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
-          }   
-      }   
-
        if( 1 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
@@ -1145,7 +1248,7 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector);
-          cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
+            cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
         }

    }   
@@ -1155,3 +1258,19 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )

   // ---------------------------------------------------------------------------------------
 }
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Amoeba electrostatic force & torque using PME
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
+{
+ 
+    cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
+    kCalculateAmoebaPME( amoebaGpu );
+} 
+
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.h
@@ -34,15 +34,8 @@ __launch_bounds__(128, 1)
 #else
 __launch_bounds__(64, 1)
 #endif
-void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
-                            unsigned int* workUnit,
-                            float4* atomCoord,
-                            float* labFrameDipole,
-                            float* labFrameQuadrupole,
-                            float* inducedDipole,
-                            float* inducedDipolePolar,
-                            float* outputForce,
-                            float* outputTorque
+void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
+                            unsigned int* workUnit, float* outputForce, float* outputTorque

 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
@@ -50,10 +43,11 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
 ){

 #ifdef AMOEBA_DEBUG
+    int pullIndexMax = 12;
    float4 pullBack[20];
 #endif

-    extern __shared__ RealSpaceEwaldParticle sA[];
+    extern __shared__ PmeDirectElectrostaticParticle sA[];

    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
@@ -80,12 +74,10 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
        unsigned int tbx              = threadIdx.x - tgx;
        unsigned int tj               = tgx;

-        RealSpaceEwaldParticle* psA    = &sA[tbx];
+        PmeDirectElectrostaticParticle* psA    = &sA[tbx];
        unsigned int atomI            = x + tgx;
-        RealSpaceEwaldParticle localParticle;
-        loadRealSpaceEwaldShared(&localParticle, atomI,
-                                atomCoord, labFrameDipole, labFrameQuadrupole,
-                                inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
+        PmeDirectElectrostaticParticle localParticle;
+        loadPmeDirectElectrostaticShared(&localParticle, atomI );

        localParticle.force[0]                   = 0.0f;
        localParticle.force[1]                   = 0.0f;
@@ -105,9 +97,7 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(

            // load shared data

-           loadRealSpaceEwaldShared( &(sA[threadIdx.x]), atomI,
-                                    atomCoord, labFrameDipole, labFrameQuadrupole,
-                                    inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
+            loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );

            if (!bExclusionFlag)
            {
@@ -122,9 +112,8 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
                    float torque[2][3];

                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[j],
-                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
-                                                          force, torque, &energy
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
+                                                                   scalingFactors, force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -171,9 +160,8 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
                    // force

                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[j],
-                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
-                                                          force, torque, &energy
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[j],
+                                                                   scalingFactors, force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -199,58 +187,54 @@ void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom ){
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+    float blockId                      = 1.0f;

    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) atomJ;
-            debugArray[index].z                = 1.0f;
-            debugArray[index].w                = (float) y;
-/*
+    debugArray[index].z                = (float) y;
+    debugArray[index].w                = blockId;
+
    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? force[0]     : 0.0f;
    debugArray[index].y                = mask ? force[1]     : 0.0f;
    debugArray[index].z                = mask ? force[2]     : 0.0f;
+    debugArray[index].w                = blockId;


    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
-*/
+    debugArray[index].w                = blockId;

    index                             += cAmoebaSim.paddedNumberOfAtoms;
-            int pullIndex                      = 0;
-            debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].w                = pullBack[pullIndex].w;
-
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            pullIndex++;
-            debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].w                = pullBack[pullIndex].w;
-
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            pullIndex++;
-            debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].w                = pullBack[pullIndex].w;
+    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
+    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
+    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
+    debugArray[index].w                = blockId;

+    for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-            pullIndex++;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
        debugArray[index].w                = pullBack[pullIndex].w;
-
+    }
 }
 #endif

                }
            }

+            // include self energy and self torque
+
+            if( atomI < cAmoebaSim.numberOfAtoms ){
+                calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
+                //float energy;
+                //calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
+                //totalEnergy += energy;
+            }
+
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
@@ -300,9 +284,7 @@ if( atomI == targetAtom ){
            {
                // load shared data

-               loadRealSpaceEwaldShared( &(sA[threadIdx.x]), (y+tgx),
-                                        atomCoord, labFrameDipole, labFrameQuadrupole,
-                                        inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
+               loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );

            }

@@ -325,9 +307,8 @@ if( atomI == targetAtom ){
                    unsigned int atomJ = y + tj;

                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[tj],
-                                                                 cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
-                                                                 force, torque, &energy
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,  psA[tj],
+                                                                   scalingFactors, force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -360,60 +341,44 @@ if( atomI == targetAtom ){
       
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom  || atomJ == targetAtom ){
+
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
    unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
    unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
    float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+    float blockId                      = 2.0f;

    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) atomJ;
-        debugArray[index].z                = 2.0f;
-        debugArray[index].w                = (float) y;
-/*
+    debugArray[index].z                = (float) y;
+    debugArray[index].w                = blockId;
+
    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
    debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
    debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
+    debugArray[index].w                = blockId;


    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
    debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
    debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
+    debugArray[index].w                = blockId;

    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
    debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
    debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
-*/
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        int pullIndex                      = 0;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
+    debugArray[index].w                = blockId;

+    for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
        debugArray[index].w                = pullBack[pullIndex].w;
-
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-       
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-       
+    }
 }
 #endif
       
@@ -449,9 +414,8 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                           // force
       
                    float energy;
-                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[tj],
-                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
-                                                          force, torque, &energy
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[tj],
+                                                                   scalingFactors, force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
@@ -490,55 +454,39 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
        unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
        float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+        float blockId                      = 3.0f;

        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) atomJ;
-         debugArray[index].z                = 3.0f;
-         debugArray[index].w                = (float) y;
-/*
+        debugArray[index].z                = (float) y;
+        debugArray[index].w                = blockId;
        index                             += cAmoebaSim.paddedNumberOfAtoms;
+
        debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
        debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
        debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
+        debugArray[index].w                = blockId;


        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
        debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
        debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
+        debugArray[index].w                = blockId;

        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
        debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
        debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
-*/
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        int pullIndex                      = 0;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
+        debugArray[index].w                = blockId;

+        for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-        pullIndex++;
            debugArray[index].x                = pullBack[pullIndex].x;
            debugArray[index].y                = pullBack[pullIndex].y;
            debugArray[index].z                = pullBack[pullIndex].z;
            debugArray[index].w                = pullBack[pullIndex].w;
+        }
       
 #if 0
            index                             += cAmoebaSim.paddedNumberOfAtoms;
@@ -550,13 +498,6 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            index                             += cAmoebaSim.paddedNumberOfAtoms;
            debugArray[index].x                = scalingFactors[MScaleIndex];
            debugArray[index].y                = mScaleVal;
-            for( int pIndex = 0; pIndex < 14; pIndex++ ){
-                index                             += cAmoebaSim.paddedNumberOfAtoms;
-                debugArray[index].x                = pullBack[pIndex].x;
-                debugArray[index].y                = pullBack[pIndex].y;
-                debugArray[index].z                = pullBack[pIndex].z;
-                debugArray[index].w                = pullBack[pIndex].w;
-            }

            index                             += cAmoebaSim.paddedNumberOfAtoms;
            debugArray[index].x                = labFrameDipole[3*atomI];

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -322,7 +322,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&

   --------------------------------------------------------------------------------------- */

-void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
+static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 {
  
   // ---------------------------------------------------------------------------------------
@@ -487,3 +487,9 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )

         }
 }
+
+void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
+{
+    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
+    kCalculateAmoebaPMEFixedMultipoleField( amoebaGpu );
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -616,6 +616,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        // matrix multiply

        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
+        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
        LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");  



--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -390,7 +390,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
        cudaComputeAmoebaElectrostatic( amoebaGpu );
    } else {
-        cudaComputeAmoebaRealSpaceEwald( amoebaGpu );
+        cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
    }

    // map torques to forces

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -2008,6 +2008,7 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::NoCutoff );
    }
    multipoleForce->setCutoffDistance( cutoffDistance );
+    multipoleForce->setAEwald( aewald );
    system.setDefaultPeriodicBoxVectors( Vec3(box[0], 0.0, 0.0), Vec3(0.0, box[1], 0.0), Vec3(0.0, 0.0, box[2]) );
    if( log ){
        (void) fprintf( log, "%s number of MultipoleParameter terms=%d usePme=%d aewald=%15.7e cutoffDistance=%12.4f\n",
@@ -2111,6 +2112,7 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        double polarityConversion      = AngstromToNm*AngstromToNm*AngstromToNm;
        double dampingFactorConversion = sqrt( AngstromToNm );
      
+        multipoleForce->setAEwald(                multipoleForce->getAEwald()/AngstromToNm );
        multipoleForce->setCutoffDistance(        multipoleForce->getCutoffDistance()*AngstromToNm );
        multipoleForce->setScalingDistanceCutoff( multipoleForce->getScalingDistanceCutoff()*AngstromToNm );

@@ -2165,8 +2167,8 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
                        (useOpenMMUnits ? "OpenMM" : "Amoeba") );

        std::string nonbondedMethod = multipoleForce->getNonbondedMethod( ) == AmoebaMultipoleForce::PME ? "PME" : "NoCutoff";
-        (void) fprintf( log, "NonbondedMethod=%s cutoff=%15.7e.\n", nonbondedMethod.c_str(),
-                         multipoleForce->getCutoffDistance() );
+        (void) fprintf( log, "NonbondedMethod=%s aEwald=%15.7e cutoff=%15.7e.\n", nonbondedMethod.c_str(),
+                        multipoleForce->getAEwald(), multipoleForce->getCutoffDistance() );

        Vec3 a,b,c;
        system.getDefaultPeriodicBoxVectors( a, b, c );