AmoebaCudaPmeMutualInducedField

c22f00cf · Mark Friedrichs · 3763b76b · c22f00cf · c22f00cf · c22f00cf
Commit c22f00cf authored Sep 13, 2010 by Mark Friedrichs
10 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -2763,6 +2763,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
    SetCalculateAmoebaCudaVdw14_7Sim( amoebaGpu );
    SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
    SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );
+    SetCalculateAmoebaCudaPmeMutualInducedFieldSim( amoebaGpu );
    SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu );
    SetCalculateAmoebaElectrostaticSim( amoebaGpu );
    SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -88,6 +88,10 @@ extern void SetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu);
 extern void cudaComputeAmoebaMutualInducedField( amoebaGpuContext gpu);
+extern void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext gpu);
+extern void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext gpu);
+extern void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext gpu);
 // mutual induced and Gk
 extern void SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -36,14 +36,11 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
 //#define AMOEBA_DEBUG
 #undef AMOEBA_DEBUG
-__device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ,
+#define GK
-                                                                 float dampingFactorI,    float dampingFactorJ,
+#include "kCalculateAmoebaCudaMutualInducedParticle.h"
-                                                                 float tholeI,            float tholeJ,
+#undef GK
-                                                                 float* inducedDipoleI,   float* inducedDipolePolarI,
-                                                                 float* inducedDipoleJ,   float* inducedDipolePolarJ,
+__device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                                 float* inducedDipoleSI,  float* inducedDipolePolarSI,
-                                                                 float* inducedDipoleSJ,  float* inducedDipolePolarSJ,
-                                                                 float scalingDistanceCutoff,
                                                                 float fields[8][3]
 #ifdef AMOEBA_DEBUG
@@ -59,9 +56,9 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoor
    // get deltaR, and r between 2 atoms
-    deltaR[0]                                    = atomCoordinatesJ.x - atomCoordinatesI.x;
+    deltaR[0]                                    = atomJ.x - atomI.x;
-    deltaR[1]                                    = atomCoordinatesJ.y - atomCoordinatesI.y;
+    deltaR[1]                                    = atomJ.y - atomI.y;
-    deltaR[2]                                    = atomCoordinatesJ.z - atomCoordinatesI.z;
+    deltaR[2]                                    = atomJ.z - atomI.z;
    float r                                      =  sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
    float rI                                     =  1.0f/r;
@@ -69,112 +66,59 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoor
    float rr3                                    = -rI*r2I;
    float rr5                                    = -3.0f*rr3*r2I;
-    float dampProd                               = dampingFactorI*dampingFactorJ;
+    float dampProd                               = atomI.damp*atomJ.damp;
    float ratio                                  = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
-    float pGamma                                 = tholeJ > tholeI ? tholeI : tholeJ;
+    float pGamma                                 = atomI.thole > atomJ.thole ? atomJ.thole: atomI.thole;
    float damp                                   = ratio*ratio*ratio*pGamma;
-    float dampExp                                = ( (dampProd != 0.0f) && (r < scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; 
+    float dampExp                                = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; 
    rr3                                         *= (1.0f - dampExp);
    rr5                                         *= (1.0f - ( 1.0f + damp )*dampExp);
-    float* dipole                                = inducedDipoleJ;
+    float dDotDelta                              = rr5*(deltaR[0]*atomJ.inducedDipole[0]    + deltaR[1]*atomJ.inducedDipole[1]    + deltaR[2]*atomJ.inducedDipole[2] );
-    float dDotDelta                              = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[0][0]                                 = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
-    fields[0][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[0][1]                                 = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
-    fields[0][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[0][2]                                 = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
-    fields[0][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipolePolarJ;
+    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0]    + deltaR[1]*atomJ.inducedDipolePolar[1]    + deltaR[2]*atomJ.inducedDipolePolar[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[1][0]                                 = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
-    fields[1][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[1][1]                                 = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
-    fields[1][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[1][2]                                 = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
-    fields[1][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipoleI;
+    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipole[0]    + deltaR[1]*atomI.inducedDipole[1]    + deltaR[2]*atomI.inducedDipole[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[2][0]                                 = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
-    fields[2][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[2][1]                                 = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
-    fields[2][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[2][2]                                 = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
-    fields[2][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipolePolarI;
+    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolar[0]    + deltaR[1]*atomI.inducedDipolePolar[1]    + deltaR[2]*atomI.inducedDipolePolar[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[3][0]                                 = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
-    fields[3][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[3][1]                                 = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
-    fields[3][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[3][2]                                 = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];
-    fields[3][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
+    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipoleS[0]    + deltaR[1]*atomJ.inducedDipoleS[1]    + deltaR[2]*atomJ.inducedDipoleS[2] );
-    dipole                                       = inducedDipoleSJ;
+    fields[4][0]                                 = rr3*atomJ.inducedDipoleS[0] + dDotDelta*deltaR[0];
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[4][1]                                 = rr3*atomJ.inducedDipoleS[1] + dDotDelta*deltaR[1];
-    fields[4][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[4][2]                                 = rr3*atomJ.inducedDipoleS[2] + dDotDelta*deltaR[2];
-    fields[4][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
-    fields[4][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipolePolarSJ;
+    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolarS[0]    + deltaR[1]*atomJ.inducedDipolePolarS[1]    + deltaR[2]*atomJ.inducedDipolePolarS[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[5][0]                                 = rr3*atomJ.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
-    fields[5][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[5][1]                                 = rr3*atomJ.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
-    fields[5][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[5][2]                                 = rr3*atomJ.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
-    fields[5][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipoleSI;
+    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipoleS[0]    + deltaR[1]*atomI.inducedDipoleS[1]    + deltaR[2]*atomI.inducedDipoleS[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[6][0]                                 = rr3*atomI.inducedDipoleS[0] + dDotDelta*deltaR[0];
-    fields[6][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[6][1]                                 = rr3*atomI.inducedDipoleS[1] + dDotDelta*deltaR[1];
-    fields[6][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[6][2]                                 = rr3*atomI.inducedDipoleS[2] + dDotDelta*deltaR[2];
-    fields[6][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipolePolarSI;
+    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolarS[0]    + deltaR[1]*atomI.inducedDipolePolarS[1]    + deltaR[2]*atomI.inducedDipolePolarS[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[7][0]                                 = rr3*atomI.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
-    fields[7][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[7][1]                                 = rr3*atomI.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
-    fields[7][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[7][2]                                 = rr3*atomI.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
-    fields[7][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-#ifdef AMOEBA_DEBUGX
-//int n2         = numberOfAtoms*numberOfAtoms;
-//int debugIndex = atomI*numberOfAtoms + atomJ;
-if( atomI == 0 ){
-int debugIndex               = atomJ;
-    debugArray[debugIndex].x = rr3;
-    debugArray[debugIndex].y = rr5;
-    dipole                                       = &inducedDipole[atomJ*3];
-    debugArray[debugIndex].z = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    dipole                                       = &inducedDipole[atomI*3];
-    debugArray[debugIndex].w = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    debugIndex              += numberOfAtoms;
-    dipole                                       = &inducedDipolePolar[atomJ*3];
-    debugArray[debugIndex].x = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    dipole                                       = &inducedDipolePolar[atomI*3];
-    debugArray[debugIndex].y = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields1[atomI*numberOfAtoms*3 + atomJ*3 +0];
-    debugArray[debugIndex].y = fields1[atomI*numberOfAtoms*3 + atomJ*3 +1];
-    debugArray[debugIndex].z = fields1[atomI*numberOfAtoms*3 + atomJ*3 +2];
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields1[atomJ*numberOfAtoms*3 + atomI*3 +0];
-    debugArray[debugIndex].y = fields1[atomJ*numberOfAtoms*3 + atomI*3 +1];
-    debugArray[debugIndex].z = fields1[atomJ*numberOfAtoms*3 + atomI*3 +2];
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields2[atomI*numberOfAtoms*3 + atomJ*3 +0];
-    debugArray[debugIndex].y = fields2[atomI*numberOfAtoms*3 + atomJ*3 +1];
-    debugArray[debugIndex].z = fields2[atomI*numberOfAtoms*3 + atomJ*3 +2];
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields2[atomJ*numberOfAtoms*3 + atomI*3 +0];
-    debugArray[debugIndex].y = fields2[atomJ*numberOfAtoms*3 + atomI*3 +1];
-    debugArray[debugIndex].z = fields2[atomJ*numberOfAtoms*3 + atomI*3 +2];
-}
-#endif
 }
-__device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ, float rb2,
+__device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                                   float* inducedDipoleSI,  float* inducedDipolePolarSI,
-                                                                   float* inducedDipoleSJ,  float* inducedDipolePolarSJ,
                                                                   float gkField[8][3]
 #ifdef AMOEBA_DEBUG
@@ -191,14 +135,16 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCo
    // ---------------------------------------------------------------------------------------
-    float xr               = atomCoordinatesJ.x - atomCoordinatesI.x;
+    float xr               = atomJ.x - atomI.x;
-    float yr               = atomCoordinatesJ.y - atomCoordinatesI.y;
+    float yr               = atomJ.y - atomI.y;
-    float zr               = atomCoordinatesJ.z - atomCoordinatesI.z;
+    float zr               = atomJ.z - atomI.z;
    float xr2              = xr*xr;
    float yr2              = yr*yr;
    float zr2              = zr*zr;
+    float rb2              = atomI.bornRadius*atomJ.bornRadius;
    float r2               = xr2 + yr2 + zr2;
    float expterm          = expf(-r2/(cAmoebaSim.gkc*rb2));
    float expc             = expterm /cAmoebaSim.gkc; 
@@ -209,21 +155,21 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCo
    float gf3              = gf2 * gf;
    float gf5              = gf3 * gf2;
-    float duixs            = inducedDipoleSI[0];
+    float duixs            = atomI.inducedDipoleS[0];
-    float duiys            = inducedDipoleSI[1];
+    float duiys            = atomI.inducedDipoleS[1];
-    float duizs            = inducedDipoleSI[2];
+    float duizs            = atomI.inducedDipoleS[2];
-    float puixs            = inducedDipolePolarSI[0];
+    float puixs            = atomI.inducedDipolePolarS[0];
-    float puiys            = inducedDipolePolarSI[1];
+    float puiys            = atomI.inducedDipolePolarS[1];
-    float puizs            = inducedDipolePolarSI[2];
+    float puizs            = atomI.inducedDipolePolarS[2];
-    float dukxs            = inducedDipoleSJ[0];
+    float dukxs            = atomJ.inducedDipoleS[0];
-    float dukys            = inducedDipoleSJ[1];
+    float dukys            = atomJ.inducedDipoleS[1];
-    float dukzs            = inducedDipoleSJ[2];
+    float dukzs            = atomJ.inducedDipoleS[2];
-    float pukxs            = inducedDipolePolarSJ[0];
+    float pukxs            = atomJ.inducedDipolePolarS[0];
-    float pukys            = inducedDipolePolarSJ[1];
+    float pukys            = atomJ.inducedDipolePolarS[1];
-    float pukzs            = inducedDipolePolarSJ[2];
+    float pukzs            = atomJ.inducedDipolePolarS[2];
    // reaction potential auxiliary terms
@@ -280,9 +226,6 @@ __device__ static int debugAccumulate( int index, float4* debugArray, float* fie
 }
 #endif
-#define GK
-#include "kCalculateAmoebaCudaMutualInducedParticle.h"
-#undef GK
 // Include versions of the kernels for N^2 calculations.
@@ -587,24 +530,16 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
    }
    if (gpu->bOutputBufferPerWarp){
-#if 0
+        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
-        (void) fprintf( amoebaGpu->log, "N2 warp\n" ); (void) fflush( amoebaGpu->log );
-        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks,
-                                                                  amoebaGpu->nonbondThreadsPerBlock,
-                                                                  sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                 gpu->psPosq4->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-#ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                 amoebaGpu->psWorkArray_3_3->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                 amoebaGpu->psWorkArray_3_4->_pDevStream[0],
                                                                 debugArray->_pDevStream[0], targetAtom );
 #else
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                 amoebaGpu->psWorkArray_3_4->_pDevStream[0] );
-#endif
 #endif
    } else {
@@ -619,12 +554,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
        kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                            amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                            gpu->psPosq4->_pDevStream[0],
-                                                            gpu->psBornRadii->_pDevStream[0],
-                                                            amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                            amoebaGpu->psInducedDipolePolar->_pDevStream[0],
-                                                            amoebaGpu->psInducedDipoleS->_pDevStream[0],
-                                                            amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
                                                            amoebaGpu->psWorkArray_3_1->_pDevStream[0],
                                                            amoebaGpu->psWorkArray_3_2->_pDevStream[0],
                                                            amoebaGpu->psWorkArray_3_3->_pDevStream[0],

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
@@ -36,12 +36,6 @@ __launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
                            unsigned int* workUnit,
-                            float4* atomCoord,
-                            float*  bornRadii,
-                            float* inducedDipole,
-                            float* inducedDipolePolar,
-                            float* inducedDipoleS,
-                            float* inducedDipolePolarS,
                            float* outputField,
                            float* outputFieldPolar,
                            float* outputFieldS,
@@ -60,13 +54,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;
-    float4 jCoord;
-    float  jBornRadius;
-    float  jDipole[3];     
-    float  jDipolePolar[3];     
-    float  jDipoleS[3];     
-    float  jDipolePolarS[3];     
    while (pos < end)
    {
@@ -84,7 +71,8 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
        MutualInducedParticle*  psA      = &sA[tbx];
        unsigned int atomI               = x + tgx;
-        float4 iCoord                    = atomCoord[atomI];
+        MutualInducedParticle localParticle;
+        loadMutualInducedShared( &localParticle, atomI );
        float fieldSum[3];
        float fieldPolarSum[3];
@@ -115,9 +103,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
            // load shared data
-            loadMutualInducedShared( &(sA[threadIdx.x]), atomI,
+            loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
-                                     atomCoord, inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole,
-                                     bornRadii, inducedDipoleS, inducedDipolePolarS );
            for (unsigned int j = 0; j < GRID; j++)
            {
@@ -126,16 +112,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
                // load coords, charge, ...
-                loadMutualInducedData( &(psA[j]), &jCoord, jDipole, jDipolePolar, &jBornRadius, jDipoleS, jDipolePolarS ); 
+                calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[j], ijField
-                calculateMutualInducedAndGkFieldsPairIxn_kernel( iCoord,                                          jCoord,
-                                         cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[j].damp,
-                                         cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[j].thole,
-                                         &(inducedDipole[atomI*3]),                       &(inducedDipolePolar[atomI*3]),
-                                         jDipole,                                         jDipolePolar,
-                                         &(inducedDipoleS[atomI*3]),                      &(inducedDipolePolarS[atomI*3]),
-                                         jDipoleS,                                        jDipolePolarS,
-                                         cAmoebaSim.scalingDistanceCutoff,                ijField
 #ifdef AMOEBA_DEBUG
 ,  debugArray
 #endif
@@ -177,10 +154,7 @@ if( atomI == targetAtom ){
        index                              = debugAccumulate( index, debugArray, ijField[5],   mask, 4.0f );
 }
 #endif
-                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( iCoord,                                          jCoord, bornRadii[atomI]*jBornRadius,
+                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[j], ijField
-                                                                   &(inducedDipoleS[atomI*3]),                      &(inducedDipolePolarS[atomI*3]),
-                                                                   jDipoleS,                                        jDipolePolarS,
-                                                                   ijField
 #ifdef AMOEBA_DEBUG
                                                                   , debugArray
 #endif
@@ -247,9 +221,7 @@ if( atomI == targetAtom ){
            {
                // load coordinates, charge, ...
-                loadMutualInducedShared( &(sA[threadIdx.x]), (y+tgx),
+                loadMutualInducedShared( &(sA[threadIdx.x]), (y+tgx) );
-                                         atomCoord, inducedDipole,
-                                         inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole, bornRadii, inducedDipoleS, inducedDipolePolarS );
            }
           // zero shared fields
@@ -263,16 +235,7 @@ if( atomI == targetAtom ){
                // load coords, charge, ...
-                loadMutualInducedData( &(psA[tj]), &jCoord, jDipole, jDipolePolar, &jBornRadius, jDipoleS, jDipolePolarS ); 
+                calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[tj], ijField
-                calculateMutualInducedAndGkFieldsPairIxn_kernel( iCoord,                  jCoord,
-                                         cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[tj].damp,
-                                         cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[tj].thole,
-                                         &(inducedDipole[atomI*3]),                       &(inducedDipolePolar[atomI*3]),
-                                         jDipole,                                         jDipolePolar,
-                                         &(inducedDipoleS[atomI*3]),                      &(inducedDipolePolarS[atomI*3]),
-                                         jDipoleS,                                        jDipolePolarS,
-                                         cAmoebaSim.scalingDistanceCutoff, ijField
 #ifdef AMOEBA_DEBUG
 ,  debugArray
 #endif
@@ -344,10 +307,7 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
        index                              = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f );
 }
 #endif
-                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( iCoord,                                          jCoord, bornRadii[atomI]*jBornRadius,
+                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[tj], ijField
-                                                                   &(inducedDipoleS[atomI*3]),                      &(inducedDipolePolarS[atomI*3]),
-                                                                   jDipoleS,                                        jDipolePolarS,
-                                                                   ijField
 #ifdef AMOEBA_DEBUG
 ,  debugArray
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -36,12 +36,9 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
 //#define AMOEBA_DEBUG
 #undef AMOEBA_DEBUG
-__device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ,
+#include "kCalculateAmoebaCudaMutualInducedParticle.h"
-                                                           float dampingFactorI,    float dampingFactorJ,
-                                                           float tholeI,            float tholeJ,
+__device__ void calculateMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                           float* inducedDipoleI,   float* inducedDipolePolarI,
-                                                           float* inducedDipoleJ,   float* inducedDipolePolarJ,
-                                                           float scalingDistanceCutoff,
                                                           float fields[4][3]
 #ifdef AMOEBA_DEBUG
@@ -57,9 +54,9 @@ __device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinate
    // get deltaR, and r between 2 atoms
-    deltaR[0]                                    = atomCoordinatesJ.x - atomCoordinatesI.x;
+    deltaR[0]                                    = atomJ.x - atomI.x;
-    deltaR[1]                                    = atomCoordinatesJ.y - atomCoordinatesI.y;
+    deltaR[1]                                    = atomJ.y - atomI.y;
-    deltaR[2]                                    = atomCoordinatesJ.z - atomCoordinatesI.z;
+    deltaR[2]                                    = atomJ.z - atomI.z;
    float r                                      =  sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
    float rI                                     =  1.0f/r;
@@ -67,87 +64,36 @@ __device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinate
    float rr3                                    = -rI*r2I;
    float rr5                                    = -3.0f*rr3*r2I;
-    float dampProd                               = dampingFactorI*dampingFactorJ;
+    float dampProd                               = atomI.damp*atomJ.damp;
    float ratio                                  = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
-    float pGamma                                 = tholeJ > tholeI ? tholeI : tholeJ;
+    float pGamma                                 = atomJ.thole > atomI.thole ? atomI.thole: atomJ.thole;
    float damp                                   = ratio*ratio*ratio*pGamma;
-    float dampExp                                = ( (dampProd != 0.0f) && (r < scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; 
+    float dampExp                                = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; 
    rr3                                         *= (1.0f - dampExp);
    rr5                                         *= (1.0f - ( 1.0f + damp )*dampExp);
-    float* dipole                                = inducedDipoleJ;
+    float dDotDelta                              = rr5*(deltaR[0]*atomJ.inducedDipole[0]         + deltaR[1]*atomJ.inducedDipole[1]       + deltaR[2]*atomJ.inducedDipole[2] );
-    float dDotDelta                              = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[0][0]                                 = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
-    fields[0][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[0][1]                                 = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
-    fields[0][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[0][2]                                 = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
-    fields[0][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipolePolarJ;
+    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0]    + deltaR[1]*atomJ.inducedDipolePolar[1]  + deltaR[2]*atomJ.inducedDipolePolar[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[1][0]                                 = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
-    fields[1][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[1][1]                                 = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
-    fields[1][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[1][2]                                 = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
-    fields[1][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipoleI;
+    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipole[0]         + deltaR[1]*atomI.inducedDipole[1]       + deltaR[2]*atomI.inducedDipole[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[2][0]                                 = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
-    fields[2][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[2][1]                                 = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
-    fields[2][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[2][2]                                 = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
-    fields[2][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-    dipole                                       = inducedDipolePolarI;
+    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolar[0]    + deltaR[1]*atomI.inducedDipolePolar[1]  + deltaR[2]*atomI.inducedDipolePolar[2] );
-    dDotDelta                                    = rr5*(deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
+    fields[3][0]                                 = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
-    fields[3][0]                                 = rr3*dipole[0] + dDotDelta*deltaR[0];
+    fields[3][1]                                 = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
-    fields[3][1]                                 = rr3*dipole[1] + dDotDelta*deltaR[1];
+    fields[3][2]                                 = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];
-    fields[3][2]                                 = rr3*dipole[2] + dDotDelta*deltaR[2];
-#ifdef AMOEBA_DEBUGX
-//int n2         = numberOfAtoms*numberOfAtoms;
-//int debugIndex = atomI*numberOfAtoms + atomJ;
-if( atomI == 0 ){
-int debugIndex               = atomJ;
-    debugArray[debugIndex].x = rr3;
-    debugArray[debugIndex].y = rr5;
-    dipole                                       = &inducedDipole[atomJ*3];
-    debugArray[debugIndex].z = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    dipole                                       = &inducedDipole[atomI*3];
-    debugArray[debugIndex].w = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    debugIndex              += numberOfAtoms;
-    dipole                                       = &inducedDipolePolar[atomJ*3];
-    debugArray[debugIndex].x = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    dipole                                       = &inducedDipolePolar[atomI*3];
-    debugArray[debugIndex].y = (deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2] );
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields1[atomI*numberOfAtoms*3 + atomJ*3 +0];
-    debugArray[debugIndex].y = fields1[atomI*numberOfAtoms*3 + atomJ*3 +1];
-    debugArray[debugIndex].z = fields1[atomI*numberOfAtoms*3 + atomJ*3 +2];
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields1[atomJ*numberOfAtoms*3 + atomI*3 +0];
-    debugArray[debugIndex].y = fields1[atomJ*numberOfAtoms*3 + atomI*3 +1];
-    debugArray[debugIndex].z = fields1[atomJ*numberOfAtoms*3 + atomI*3 +2];
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields2[atomI*numberOfAtoms*3 + atomJ*3 +0];
-    debugArray[debugIndex].y = fields2[atomI*numberOfAtoms*3 + atomJ*3 +1];
-    debugArray[debugIndex].z = fields2[atomI*numberOfAtoms*3 + atomJ*3 +2];
-    debugIndex              += numberOfAtoms;
-    debugArray[debugIndex].x = fields2[atomJ*numberOfAtoms*3 + atomI*3 +0];
-    debugArray[debugIndex].y = fields2[atomJ*numberOfAtoms*3 + atomI*3 +1];
-    debugArray[debugIndex].z = fields2[atomJ*numberOfAtoms*3 + atomI*3 +2];
-}
-#endif
 }
-#include "kCalculateAmoebaCudaMutualInducedParticle.h"
 // Include versions of the kernels for N^2 calculations.
 #define METHOD_NAME(a, b) a##N2##b
@@ -230,18 +176,9 @@ void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDe
    {   
        epsilon[0]  = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
        epsilon[0]  = 4.8033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
-#ifdef AMOEBA_DEBUG
-        epsilon[1]  = 4.8033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
-        epsilon[2]  = 4.8033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
-#endif
    }   
 }
-/**
-   matrixProduct/matrixProductP contains epsilon**2 on output
-*/
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
@@ -291,46 +228,6 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
    LAUNCHERROR("kReduceMI_Fields2");
 }
-#ifdef AMOEBA_DEBUG
-#if 0
-static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
-{
-    (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
-    unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
-    unsigned int stop  = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
-    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        unsigned int ii3Index      = ii/3;
-        unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
-        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
-    } 
-}
-static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
-{
-    (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
-    for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
-    } 
-}
-#endif
-#endif
 /**---------------------------------------------------------------------------------------
   Compute mutual induce field
@@ -367,9 +264,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
                                                                   amoebaGpu->nonbondThreadsPerBlock,
                                                                   sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                 gpu->psPosq4->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -392,9 +286,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
                                                           amoebaGpu->nonbondThreadsPerBlock,
                                                           sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                 gpu->psPosq4->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -410,17 +301,8 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
 #ifdef AMOEBA_DEBUG
-        amoebaGpu->psWorkArray_3_1->Download();
+    amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_3_2->Download();
+    amoebaGpu->psWorkArray_3_2->Download();
-        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
-        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
-        //printMiFieldAtomBuffers( amoebaGpu, 100 );
-        //printMiFieldBuffer( amoebaGpu, 0 );
-        //printMiFieldBuffer( amoebaGpu, 1 );
-        //printMiFieldBuffer( amoebaGpu, 37 );
-        //printMiFieldBuffer( amoebaGpu, 38 );
    if( amoebaGpu->log && iteration == -1 ){
        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
@@ -523,50 +405,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
                   }
                   (void) fprintf( amoebaGpu->log,"\n" );
                   count++;
-#if 0
-                   //debugIndex += gpu->natoms;
-                   debugIndex += paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
-                   int index = 0;
-                   sums[index][0] += debugArray->_pSysStream[0][debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysStream[0][debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysStream[0][debugIndex].z; 
-                   debugIndex += gpu->natoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
-                   index++;
-                   sums[index][0] += debugArray->_pSysStream[0][debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysStream[0][debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysStream[0][debugIndex].z; 
-                   debugIndex += gpu->natoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
-                   index++;
-                   sums[index][0] += debugArray->_pSysStream[0][debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysStream[0][debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysStream[0][debugIndex].z; 
-                   debugIndex += gpu->natoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
-                   index++;
-                   sums[index][0] += debugArray->_pSysStream[0][debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysStream[0][debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysStream[0][debugIndex].z; 
-#endif
               }
               (void) fprintf( amoebaGpu->log,"\n" );
@@ -693,43 +531,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
        LAUNCHERROR("cudaComputeAmoebaMutualInducedFieldMatrixMultiply Loop\n");  
-//#ifdef AMOEBA_DEBUG
-#if 0
-        if( 0 && amoebaGpu->log ){
-           amoebaGpu->psInducedDipole->Download();
-           amoebaGpu->psInducedDipolePolar->Download();
-           (void) fprintf( amoebaGpu->log, "%s Post cudaComputeAmoebaMutualInducedFieldMatrixMultiply\n", methodName );
-            int offset   = 0;
-            int maxPrint = 20;
-            for( int ii = 0; ii < gpu->natoms; ii++ ){
-                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
-                (void) fprintf( amoebaGpu->log," MMi[%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
-                (void) fprintf( amoebaGpu->log,"MMip[%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
-                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
-                    ii =  (gpu->natoms - maxPrint);
-                    offset = ii*3;
-                } else {
-                    offset += 3;
-                }
-            }   
-            (void) fflush( amoebaGpu->log );
-            // write results to file
-            if( iteration == -1 ){
-               fileId[1]  = iteration;
-               cudaWriteFloat1AndFloat1ArraysToFile( gpu->natoms, "CudaMI_MatrixProduct1", fileId, 3, amoebaGpu->psWorkVector[0], 
-                                           3, amoebaGpu->psWorkVector[1] );
-            }
-        }
-#endif
-    // ---------------------------------------------------------------------------------------
        // post matrix multiply
        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
@@ -763,18 +564,8 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        if( amoebaGpu->log ){
           amoebaGpu->psInducedDipole->Download();
           amoebaGpu->psInducedDipolePolar->Download();
-#if 1
+           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e done=%d\n",
-           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
+                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
-                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
-                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][1], 
-                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
-#else
-           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
-                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
-                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][0], 
-                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][1], 
-                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
-#endif
           (void) fflush( amoebaGpu->log );
            int offset   = 0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
                            unsigned int* workUnit,
-                            float4* atomCoord,
+                            float* outputField, float* outputFieldPolar
-                            float* inducedDipole,
-                            float* inducedDipolePolar,
-                            float* outputField,
-                            float* outputFieldPolar
 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
 #endif
@@ -55,10 +51,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;
-    float4 jCoord;
-    float jDipole[3];     
-    float jDipolePolar[3];     
    while (pos < end)
    {
@@ -76,7 +68,8 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
        MutualInducedParticle*  psA      = &sA[tbx];
        unsigned int atomI               = x + tgx;
-        float4 iCoord                    = atomCoord[atomI];
+        MutualInducedParticle localParticle;
+        loadMutualInducedShared( &localParticle, atomI );
        float fieldSum[3];
        float fieldPolarSum[3];
@@ -97,9 +90,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
            // load shared data
-            loadMutualInducedShared( &(sA[threadIdx.x]), atomI,
+            loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
-                                     atomCoord, inducedDipole,
-                                     inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
            for (unsigned int j = 0; j < GRID; j++)
            {
@@ -108,14 +99,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
                // load coords, charge, ...
-                loadMutualInducedData( &(psA[j]),  &jCoord, jDipole, jDipolePolar ); 
+                calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField
-                calculateMutualInducedFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                           cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[j].damp,
-                                                           cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[j].thole,
-                                                           &(inducedDipole[atomI*3]),                       &(inducedDipolePolar[atomI*3]),
-                                                           jDipole,                                         jDipolePolar,
-                                                           cAmoebaSim.scalingDistanceCutoff,                ijField
 #ifdef AMOEBA_DEBUG
 ,  debugArray
 #endif
@@ -187,9 +171,7 @@ if( atomI == targetAtom ){
                // load coordinates, charge, ...
-                loadMutualInducedShared( &(sA[threadIdx.x]), atomJ,
+                loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
-                                         atomCoord, inducedDipole,
-                                         inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
            }
           // zero shared fields
@@ -203,14 +185,7 @@ if( atomI == targetAtom ){
                // load coords, charge, ...
-                loadMutualInducedData( &(psA[tj]),  &jCoord, jDipole, jDipolePolar ); 
+                calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField
-                calculateMutualInducedFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                           cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[tj].damp,
-                                                           cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[tj].thole,
-                                                           &(inducedDipole[atomI*3]),                       &(inducedDipolePolar[atomI*3]),
-                                                           jDipole,                                         jDipolePolar,
-                                                           cAmoebaSim.scalingDistanceCutoff,                ijField
 #ifdef AMOEBA_DEBUG
 ,  debugArray
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
@@ -26,58 +26,50 @@ struct MutualInducedParticle {
 #endif
 };
-__device__ void loadMutualInducedShared( struct MutualInducedParticle* sA, unsigned int atomI,
+__device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )
-                                         float4* atomCoord,
-                                         float* inducedDipole, float* inducedDipolePolar,
-                                         float2* dampingFactorAndThole 
-#ifdef GK
-                                         , float* bornRadii, float* inducedDipoleS, float* inducedDipolePolarS
-#endif
-)
 {
    // coordinates & charge
-    sA->x                        = atomCoord[atomI].x;
+    sA->x                        = cSim.pPosq[atomI].x;
-    sA->y                        = atomCoord[atomI].y;
+    sA->y                        = cSim.pPosq[atomI].y;
-    sA->z                        = atomCoord[atomI].z;
+    sA->z                        = cSim.pPosq[atomI].z;
-    sA->q                        = atomCoord[atomI].w;
+    sA->q                        = cSim.pPosq[atomI].w;
    // dipole
-    sA->inducedDipole[0]         = inducedDipole[atomI*3];
+    sA->inducedDipole[0]         = cAmoebaSim.pInducedDipole[atomI*3];
-    sA->inducedDipole[1]         = inducedDipole[atomI*3+1];
+    sA->inducedDipole[1]         = cAmoebaSim.pInducedDipole[atomI*3+1];
-    sA->inducedDipole[2]         = inducedDipole[atomI*3+2];
+    sA->inducedDipole[2]         = cAmoebaSim.pInducedDipole[atomI*3+2];
    // dipole polar
-    sA->inducedDipolePolar[0]    = inducedDipolePolar[atomI*3];
+    sA->inducedDipolePolar[0]    = cAmoebaSim.pInducedDipolePolar[atomI*3];
-    sA->inducedDipolePolar[1]    = inducedDipolePolar[atomI*3+1];
+    sA->inducedDipolePolar[1]    = cAmoebaSim.pInducedDipolePolar[atomI*3+1];
-    sA->inducedDipolePolar[2]    = inducedDipolePolar[atomI*3+2];
+    sA->inducedDipolePolar[2]    = cAmoebaSim.pInducedDipolePolar[atomI*3+2];
-    sA->damp                     = dampingFactorAndThole[atomI].x;
+    sA->damp                     = cAmoebaSim.pDampingFactorAndThole[atomI].x;
-    sA->thole                    = dampingFactorAndThole[atomI].y;
+    sA->thole                    = cAmoebaSim.pDampingFactorAndThole[atomI].y;
 #ifdef GK
-    sA->bornRadius               = bornRadii[atomI];
+    sA->bornRadius               =  cSim.pBornRadii[atomI];
    // dipoleS
-    sA->inducedDipoleS[0]        = inducedDipoleS[atomI*3];
+    sA->inducedDipoleS[0]        = cAmoebaSim.pInducedDipoleS[atomI*3];
-    sA->inducedDipoleS[1]        = inducedDipoleS[atomI*3+1];
+    sA->inducedDipoleS[1]        = cAmoebaSim.pInducedDipoleS[atomI*3+1];
-    sA->inducedDipoleS[2]        = inducedDipoleS[atomI*3+2];
+    sA->inducedDipoleS[2]        = cAmoebaSim.pInducedDipoleS[atomI*3+2];
    // dipole polar S
-    sA->inducedDipolePolarS[0]   = inducedDipolePolarS[atomI*3];
+    sA->inducedDipolePolarS[0]   = cAmoebaSim.pInducedDipolePolarS[atomI*3];
-    sA->inducedDipolePolarS[1]   = inducedDipolePolarS[atomI*3+1];
+    sA->inducedDipolePolarS[1]   = cAmoebaSim.pInducedDipolePolarS[atomI*3+1];
-    sA->inducedDipolePolarS[2]   = inducedDipolePolarS[atomI*3+2];
+    sA->inducedDipolePolarS[2]   = cAmoebaSim.pInducedDipolePolarS[atomI*3+2];
 #endif
 }
-__device__ static void zeroMutualInducedParticleSharedField( struct MutualInducedParticle* sA )
+__device__ static void zeroMutualInducedParticleSharedField( MutualInducedParticle* sA )
 {
    // zero shared fields
@@ -101,44 +93,3 @@ __device__ static void zeroMutualInducedParticleSharedField( struct MutualInduce
 #endif
 }
-// load struct and arrays w/ shared data in sA
-__device__ void loadMutualInducedData( struct MutualInducedParticle* sA,  float4* jCoord,
-                                       float* jInducedDipole,  float* jInducedDipolePolar
-#ifdef GK
-                                      , float* jBornRadius, float* jInducedDipoleS, float* jInducedDipolePolarS
-#endif
-)
-{
-    // load coords, charge, ...
-    jCoord->x                = sA->x;
-    jCoord->y                = sA->y;
-    jCoord->z                = sA->z;
-    jCoord->w                = sA->q;
-    jInducedDipole[0]        = sA->inducedDipole[0];
-    jInducedDipole[1]        = sA->inducedDipole[1];
-    jInducedDipole[2]        = sA->inducedDipole[2];
-    jInducedDipolePolar[0]   = sA->inducedDipolePolar[0];
-    jInducedDipolePolar[1]   = sA->inducedDipolePolar[1];
-    jInducedDipolePolar[2]   = sA->inducedDipolePolar[2];
-#ifdef GK
-    *jBornRadius             = sA->bornRadius;
-    jInducedDipoleS[0]       = sA->inducedDipoleS[0];
-    jInducedDipoleS[1]       = sA->inducedDipoleS[1];
-    jInducedDipoleS[2]       = sA->inducedDipoleS[2];
-    jInducedDipolePolarS[0]  = sA->inducedDipolePolarS[0];
-    jInducedDipolePolarS[1]  = sA->inducedDipolePolarS[1];
-    jInducedDipolePolarS[2]  = sA->inducedDipolePolarS[2];
-#endif
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+//-----------------------------------------------------------------------------------------
+//-----------------------------------------------------------------------------------------
+#include "amoebaGpuTypes.h"
+#include "amoebaCudaKernels.h"
+#include "kCalculateAmoebaCudaUtilities.h"
+#include <stdio.h>
+using namespace std;
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
+    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
+    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
+    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
+    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+}
+//#define AMOEBA_DEBUG
+#undef AMOEBA_DEBUG
+#include "kCalculateAmoebaCudaMutualInducedParticle.h"
+// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
+__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
+                                                                    float uscale, float fields[4][3]
+#ifdef AMOEBA_DEBUG
+                                                            , float4* pullBack
+#endif
+ ){
+    // compute the real space portion of the Ewald summation
+    float xr          = atomJ.x - atomI.x;
+    float yr          = atomJ.y - atomI.y;
+    float zr          = atomJ.z - atomI.z;
+    // periodic boundary conditions
+    xr               -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+    float r2          = xr*xr + yr* yr + zr*zr;
+    float r           = sqrtf(r2);
+    // calculate the error function damping terms
+    float ralpha      = cSim.alphaEwald*r;
+    float bn[3];
+    bn[0]             = erfc(ralpha)/r;
+    float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
+    float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
+    float exp2a       = exp(-(ralpha*ralpha));
+    alsq2n           *= alsq2;
+    bn[1]             = (bn[0]+alsq2n*exp2a)/r2;
+    alsq2n           *= alsq2;
+    bn[2]             = (3.0f*bn[1]+alsq2n*exp2a)/r2;
+    // compute the error function scaled and unscaled terms
+    float scale3      = 1.0f;
+    float scale5      = 1.0f;
+    float damp        = atomI.damp*atomJ.damp;
+    if( damp != 0.0f ){
+        float ratio  = (r/damp);
+              ratio  = ratio*ratio*ratio;
+        float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
+              damp   = -pgamma*ratio;
+        if( damp > -50.0f) {
+            float expdamp = exp(damp);
+            scale3        = 1.0f - expdamp;
+            scale5        = 1.0f - expdamp*(1.0f-damp);
+        }
+    }
+    float dsc3        = uscale*scale3;
+    float dsc5        = uscale*scale5;
+    float r3          = (r*r2);
+    float r5          = (r3*r2);
+    float rr3         = (1.0f-dsc3)/r3;
+    float rr5         = 3.0f * (1.0f-dsc5)/r5;
+    float duir        = atomI.inducedDipole[0]*xr      + atomI.inducedDipole[1]*yr      + atomI.inducedDipole[2]*zr;
+    float dukr        = atomJ.inducedDipole[0]*xr      + atomJ.inducedDipole[1]*yr      + atomJ.inducedDipole[2]*zr;
+    float puir        = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
+    float pukr        = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
+    float fimd[3],fkmd[3];
+    float fimp[3],fkmp[3];
+    float fid[3],fkd[3];
+    float fip[3],fkp[3];
+    bn[1]            *= -1.0f;
+    fimd[0]           = bn[1]*atomJ.inducedDipole[0]      + bn[2]*dukr*xr;
+    fimd[1]           = bn[1]*atomJ.inducedDipole[1]      + bn[2]*dukr*yr;
+    fimd[2]           = bn[1]*atomJ.inducedDipole[2]      + bn[2]*dukr*zr;
+    fkmd[0]           = bn[1]*atomI.inducedDipole[0]      + bn[2]*duir*xr;
+    fkmd[1]           = bn[1]*atomI.inducedDipole[1]      + bn[2]*duir*yr;
+    fkmd[2]           = bn[1]*atomI.inducedDipole[2]      + bn[2]*duir*zr;
+    fimp[0]           = bn[1]*atomJ.inducedDipolePolar[0] + bn[2]*pukr*xr;
+    fimp[1]           = bn[1]*atomJ.inducedDipolePolar[1] + bn[2]*pukr*yr;
+    fimp[2]           = bn[1]*atomJ.inducedDipolePolar[2] + bn[2]*pukr*zr;
+    fkmp[0]           = bn[1]*atomI.inducedDipolePolar[0] + bn[2]*puir*xr;
+    fkmp[1]           = bn[1]*atomI.inducedDipolePolar[1] + bn[2]*puir*yr;
+    fkmp[2]           = bn[1]*atomI.inducedDipolePolar[2] + bn[2]*puir*zr;
+    rr3              *= -1.0f;;
+    fid[0]            = rr3*atomJ.inducedDipole[0]      + rr5*dukr*xr;
+    fid[1]            = rr3*atomJ.inducedDipole[1]      + rr5*dukr*yr;
+    fid[2]            = rr3*atomJ.inducedDipole[2]      + rr5*dukr*zr;
+    fkd[0]            = rr3*atomI.inducedDipole[0]      + rr5*duir*xr;
+    fkd[1]            = rr3*atomI.inducedDipole[1]      + rr5*duir*yr;
+    fkd[2]            = rr3*atomI.inducedDipole[2]      + rr5*duir*zr;
+    fip[0]            = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
+    fip[1]            = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
+    fip[2]            = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
+    fkp[0]            = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
+    fkp[1]            = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
+    fkp[2]            = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
+    // increment the field at each site due to this interaction
+    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+        fields[0][0]       = fimd[0] - fid[0];
+        fields[1][0]       = fkmd[0] - fkd[0];
+        fields[2][0]       = fimp[0] - fip[0];
+        fields[3][0]       = fkmp[0] - fkp[0];
+        fields[0][1]       = fimd[1] - fid[1];
+        fields[1][1]       = fkmd[1] - fkd[1];
+        fields[2][1]       = fimp[1] - fip[1];
+        fields[3][1]       = fkmp[1] - fkp[1];
+        fields[0][2]       = fimd[2] - fid[2];
+        fields[1][2]       = fkmd[2] - fkd[2];
+        fields[2][2]       = fimp[2] - fip[2];
+        fields[3][2]       = fkmp[2] - fkp[2];
+    } else {
+        fields[0][0]       = 0.0f;
+        fields[1][0]       = 0.0f;
+        fields[2][0]       = 0.0f;
+        fields[3][0]       = 0.0f;
+        fields[0][1]       = 0.0f;
+        fields[1][1]       = 0.0f;
+        fields[2][1]       = 0.0f;
+        fields[3][1]       = 0.0f;
+        fields[0][2]       = 0.0f;
+        fields[1][2]       = 0.0f;
+        fields[2][2]       = 0.0f;
+        fields[3][2]       = 0.0f;
+    }
+#ifdef AMOEBA_DEBUG
+    pullBack[0].x = xr;
+    pullBack[0].y = yr;
+    pullBack[0].z = zr;
+    pullBack[0].w = r2;
+    pullBack[1].x = alsq2;
+    pullBack[1].y = bn[0];
+    pullBack[1].z = bn[2];
+    pullBack[1].w = exp2a;
+/*
+    pullBack[1].x = atomJ.x - atomI.x;
+    pullBack[1].y = atomJ.y - atomI.y;
+    pullBack[1].z = atomJ.z - atomI.z;
+    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
+    pullBack[1].x = scale3;
+    pullBack[1].y = scale5;
+    pullBack[1].z = scale7;
+*/
+#endif
+}
+// Include versions of the kernels for N^2 calculations.
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+static void kInitializeMutualInducedField_kernel(
+                   int numberOfAtoms,
+                   float* fixedEField,
+                   float* fixedEFieldPolar,
+                   float* polarizability,
+                   float* inducedDipole,
+                   float* inducedDipolePolar )
+{
+    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    if( threadId >= 3*numberOfAtoms )return;
+    fixedEField[threadId]         *= polarizability[threadId];
+    inducedDipole[threadId]        = fixedEField[threadId];
+    fixedEFieldPolar[threadId]    *= polarizability[threadId];
+    inducedDipolePolar[threadId]   = fixedEFieldPolar[threadId];
+}
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
+{
+    extern __shared__ float2 delta[];
+    delta[threadIdx.x].x    = 0.0f;
+    delta[threadIdx.x].y    = 0.0f;
+    unsigned int pos = threadIdx.x;
+    // load deltas
+    while( pos < numberOfEntries )
+    {   
+        delta[threadIdx.x].x  += arrayOfDeltas1[pos];
+        delta[threadIdx.x].y  += arrayOfDeltas2[pos];
+        pos                   += blockDim.x*gridDim.x;
+    }   
+    __syncthreads();
+    // sum the deltas
+    for (int offset = 1; offset < blockDim.x; offset *= 2 )
+    {   
+        if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
+        {
+            delta[threadIdx.x].x   += delta[threadIdx.x+offset].x;
+            delta[threadIdx.x].y   += delta[threadIdx.x+offset].y;
+        }
+        __syncthreads();
+    }   
+    // set epsilons
+    if (threadIdx.x == 0)
+    {   
+        epsilon[0]  = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
+        epsilon[0]  = 4.8033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
+#ifdef AMOEBA_DEBUG
+        epsilon[1]  = 4.8033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
+        epsilon[2]  = 4.8033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
+#endif
+    }   
+}
+/**
+   matrixProduct/matrixProductP contains epsilon**2 on output
+*/
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+static void kSorUpdateMutualInducedField_kernel(
+                   int numberOfEntries,    float* polarizability,
+                   float* inducedDipole, float* inducedDipoleP,
+                   float* fixedEField,   float* fixedEFieldP,
+                   float* matrixProduct, float* matrixProductP )
+{
+    int threadId         = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    if( threadId  >= 3*numberOfEntries )return;
+    float previousDipole                = inducedDipole[threadId];
+    float previousDipoleP               = inducedDipoleP[threadId];
+    // add self terms to fields
+    const float term                    = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
+    matrixProduct[threadId]            +=  term*previousDipole;
+    matrixProductP[threadId]           +=  term*previousDipoleP;
+    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
+    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
+    const float polarSOR                = 0.70f;
+    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+}
+// reduce psWorkArray_3_1
+// reduce psWorkArray_3_2
+static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
+{
+    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
+                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
+    LAUNCHERROR("kReducePmeMI_Fields1");
+    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
+                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], outputPolarArray->_pDevStream[0] );
+    LAUNCHERROR("kReducePmeMI_Fields2");
+}
+/**---------------------------------------------------------------------------------------
+   Compute mutual induce field
+   @param amoebaGpu        amoebaGpu context
+   --------------------------------------------------------------------------------------- */
+static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
+                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
+{
+  gpuContext gpu    = amoebaGpu->gpuContext;
+#ifdef AMOEBA_DEBUG
+    int targetAtom    = 0;
+    static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
+    static int iteration = 1;
+    if( 1 && amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "%s: scalingDistanceCutoff=%.5f\n",
+                        methodName, amoebaGpu->scalingDistanceCutoff );
+        (void) fflush( amoebaGpu->log );
+    }
+    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    debugArray->Upload();
+#endif
+    kClearFields_3( amoebaGpu, 2 );
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                 debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+#endif
+    } else {
+#ifdef AMOEBA_DEBUG
+        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
+        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock,
+                        amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+#endif
+        kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock,
+                                                         sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                 debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+#endif
+    }
+    LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
+    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log && iteration == 1 ){
+        (void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
+        outputArray->Download();
+        outputPolarArray->Download();
+        debugArray->Download();
+        int maxPrint = 5;
+        for( int ii = 0; ii < gpu->natoms; ii++ ){
+            (void) fprintf( amoebaGpu->log, "%5d ", ii); 
+             int indexOffset     = ii*3;
+            // MI
+            (void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
+                            outputArray->_pSysStream[0][indexOffset],
+                            outputArray->_pSysStream[0][indexOffset+1],
+                            outputArray->_pSysStream[0][indexOffset+2] );
+            // MI polar
+            (void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e]\n",
+                            outputPolarArray->_pSysStream[0][indexOffset],
+                            outputPolarArray->_pSysStream[0][indexOffset+1],
+                            outputPolarArray->_pSysStream[0][indexOffset+2] );
+            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
+                ii = gpu->natoms - maxPrint;
+            }
+        }
+/*
+        int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        for( int jj = 0; jj < gpu->natoms; jj++ ){
+            int debugIndex = jj; 
+            (void) fprintf( amoebaGpu->log,"%5d PmeMIMult\n", jj );
+            for( int kk = 0; kk < 7; kk++ ){
+                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
+                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
+                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                debugIndex += paddedNumberOfAtoms;
+            }
+            (void) fprintf( amoebaGpu->log,"\n" );
+        }
+*/
+        (void) fflush( amoebaGpu->log );
+        iteration++;
+     }
+     delete debugArray;
+#endif
+}
+/**---------------------------------------------------------------------------------------
+   Compute mutual induce field
+   @param amoebaGpu        amoebaGpu context
+   --------------------------------------------------------------------------------------- */
+//#define GET_EFIELD_FROM_FILE
+#ifdef GET_EFIELD_FROM_FILE
+#include <stdlib.h>
+#endif
+static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
+{
+   // ---------------------------------------------------------------------------------------
+#ifdef AMOEBA_DEBUG
+    static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldBySOR";
+    static int timestep = 0;
+    std::vector<int> fileId;
+    timestep++;
+    fileId.resize( 2 );
+    fileId[0] = timestep;
+    fileId[1] = 1;
+#endif
+   // ---------------------------------------------------------------------------------------
+    int done;
+    int iteration;
+     gpuContext gpu    = amoebaGpu->gpuContext;
+    int numOfElems     = gpu->natoms*3;
+    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
+    int numBlocks      = numOfElems/numThreads;
+    if( (numOfElems % numThreads) != 0 )numBlocks++;
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
+                        "maxIterations=%d targetEpsilon=%.3e\n", 
+                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
+                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
+        (void) fflush( amoebaGpu->log );
+    }   
+#endif
+#ifdef GET_EFIELD_FROM_FILE
+    std::string fileName = "waterEField.txt";
+    StringVectorVector fileContents;
+    readFile( fileName, fileContents );
+    unsigned int offset  = 0;
+    (void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
+    for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
+        StringVector lineTokens     = fileContents[ii];
+        unsigned int lineTokenIndex = 1;
+        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
+        amoebaGpu->psE_Field->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psE_Field->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psE_Field->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); 
+        offset                                              -= 3;        
+        amoebaGpu->psE_FieldPolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psE_FieldPolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psE_FieldPolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+    }
+    float conversion = 100.0f;
+    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
+        amoebaGpu->psE_Field->_pSysStream[0][ii]       *= conversion;
+        amoebaGpu->psE_FieldPolar->_pSysStream[0][ii]  *= conversion;
+    }
+    amoebaGpu->psE_Field->Upload();
+    amoebaGpu->psE_FieldPolar->Upload();
+#endif
+   // ---------------------------------------------------------------------------------------
+    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
+    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
+    kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+         gpu->natoms,
+         amoebaGpu->psE_Field->_pDevStream[0],
+         amoebaGpu->psE_FieldPolar->_pDevStream[0],
+         amoebaGpu->psPolarizability->_pDevStream[0],
+         amoebaGpu->psInducedDipole->_pDevStream[0],
+         amoebaGpu->psInducedDipolePolar->_pDevStream[0] );
+    LAUNCHERROR("AmoebaPmeMutualInducedFieldSetup");  
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        amoebaGpu->psE_Field->Download();
+        amoebaGpu->psE_FieldPolar->Download();
+        amoebaGpu->psInducedDipole->Download(),
+        amoebaGpu->psInducedDipolePolar->Download();
+        amoebaGpu->psPolarizability->Download();
+        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
+        int offset   = 0;
+        int maxPrint = 20000;
+        for( int ii = 0; ii < gpu->natoms; ii++ ){
+            (void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii, 
+                            amoebaGpu->psPolarizability->_pSysStream[0][offset] );
+            if( amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+1] ||
+                amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+2] ){
+                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysStream[0][offset+1], amoebaGpu->psPolarizability->_pSysStream[0][offset+2] ); 
+            }
+            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
+                            amoebaGpu->psE_Field->_pSysStream[0][offset],       amoebaGpu->psE_Field->_pSysStream[0][offset+1],       amoebaGpu->psE_Field->_pSysStream[0][offset+2],
+                            amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
+            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
+                            amoebaGpu->psE_FieldPolar->_pSysStream[0][offset],       amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+1],       amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+2],
+                            amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
+            offset += 3;
+            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
+        }   
+        (void) fflush( amoebaGpu->log );
+    }   
+#endif
+    // ---------------------------------------------------------------------------------------
+    done      = 0;
+    iteration = 1;
+    while( !done ){
+        // matrix multiply
+        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
+        LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");  
+#ifdef GET_EFIELD_FROM_FILE
+{
+    std::string fileName = "waterInduceRecip.txt";
+    StringVectorVector fileContents;
+    readFile( fileName, fileContents );
+    unsigned int offset  = 0;
+    amoebaGpu->psWorkVector[0]->Download();
+    amoebaGpu->psWorkVector[1]->Download();
+    (void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
+    float conversion = 100.0f;
+    for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
+        StringVector lineTokens     = fileContents[ii];
+        unsigned int lineTokenIndex = 1;
+        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
+        amoebaGpu->psWorkVector[0]->_pSysStream[0][offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psWorkVector[0]->_pSysStream[0][offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psWorkVector[0]->_pSysStream[0][offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); 
+        offset                                                     -= 3;        
+        amoebaGpu->psWorkVector[1]->_pSysStream[0][offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psWorkVector[1]->_pSysStream[0][offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psWorkVector[1]->_pSysStream[0][offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+    }
+    amoebaGpu->psWorkVector[0]->Upload();
+    amoebaGpu->psWorkVector[1]->Upload();
+}
+#endif
+        // post matrix multiply
+        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+           gpu->natoms, amoebaGpu->psPolarizability->_pDevStream[0],
+           amoebaGpu->psInducedDipole->_pDevStream[0], amoebaGpu->psInducedDipolePolar->_pDevStream[0],
+           amoebaGpu->psE_Field->_pDevStream[0],       amoebaGpu->psE_FieldPolar->_pDevStream[0],
+           amoebaGpu->psWorkVector[0]->_pDevStream[0],     amoebaGpu->psWorkVector[1]->_pDevStream[0] );
+        LAUNCHERROR("kSorUpdatePmeMutualInducedField");  
+        // get total epsilon -- performing sums on gpu
+        kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
+           3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0],
+           amoebaGpu->psCurrentEpsilon->_pDevStream[0] );
+        LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
+        if( amoebaGpu->log ){
+            trackMutualInducedIterations( amoebaGpu, iteration);
+        }
+        // Debye=4.8033324f
+        amoebaGpu->psCurrentEpsilon->Download();
+        float currentEpsilon                     = amoebaGpu->psCurrentEpsilon->_pSysStream[0][0];
+        amoebaGpu->mutualInducedCurrentEpsilon   = currentEpsilon;
+        if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ 
+            done = 1;
+        }
+#ifdef AMOEBA_DEBUG
+        if( amoebaGpu->log ){
+           amoebaGpu->psInducedDipole->Download();
+           amoebaGpu->psInducedDipolePolar->Download();
+#if 1
+           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
+                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
+                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][1], 
+                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
+#else
+           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
+                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
+                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][0], 
+                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][1], 
+                           amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
+#endif
+           (void) fflush( amoebaGpu->log );
+            int offset   = 0;
+            int maxPrint = 20;
+            for( int ii = 0; ii < gpu->natoms; ii++ ){
+                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
+                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
+                                amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
+                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
+                                amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
+                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
+                    ii =  (gpu->natoms - maxPrint);
+                    offset = 3*(ii+1);
+                } else {
+                    offset += 3;
+                }
+            }   
+            (void) fflush( amoebaGpu->log );
+        }
+#endif
+        iteration++;
+    }
+    amoebaGpu->mutualInducedDone             = done;
+    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
+#ifdef AMOEBA_DEBUG
+/*
+    if( 0 ){
+        std::vector<int> fileId;
+        //fileId.push_back( 0 );
+        VectorOfDoubleVectors outputVector;
+        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
+     }
+*/
+#endif
+   // ---------------------------------------------------------------------------------------
+}
+void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
+{
+    if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
+        cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpu );
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "amoebaScaleFactors.h"
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
+                            unsigned int* workUnit,
+                            float* outputField, float* outputFieldPolar
+#ifdef AMOEBA_DEBUG
+                           , float4* debugArray, unsigned int targetAtom
+#endif
+){
+    extern __shared__ MutualInducedParticle sA[];
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+    unsigned int lasty           = 0xFFFFFFFF;
+    const float uscale           = 1.0f;
+#ifdef AMOEBA_DEBUG
+    float4 pullBack[4];
+#endif
+    while (pos < end)
+    {
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+        // Extract cell coordinates
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+        unsigned int tgx                 = threadIdx.x & (GRID - 1);
+        unsigned int tbx                 = threadIdx.x - tgx;
+        unsigned int tj                  = tgx;
+        MutualInducedParticle*  psA      = &sA[tbx];
+        unsigned int atomI               = x + tgx;
+        MutualInducedParticle localParticle;
+        loadMutualInducedShared( &localParticle, atomI );
+        float fieldSum[3];
+        float fieldPolarSum[3];
+        // 0: field at i due to j
+        // 1: field at i due to j polar
+        fieldSum[0]                      = 0.0f;
+        fieldSum[1]                      = 0.0f;
+        fieldSum[2]                      = 0.0f;
+        fieldPolarSum[0]                 = 0.0f;
+        fieldPolarSum[1]                 = 0.0f;
+        fieldPolarSum[2]                 = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // load shared data
+            loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                float ijField[4][3];
+                // load coords, charge, ...
+                calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, ijField
+#ifdef AMOEBA_DEBUG
+, pullBack 
+#endif
+);
+                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                // add to field at atomI the field due atomJ's dipole
+                fieldSum[0]            += mask ? ijField[0][0] : 0.0f;
+                fieldSum[1]            += mask ? ijField[0][1] : 0.0f;
+                fieldSum[2]            += mask ? ijField[0][2] : 0.0f;
+                fieldPolarSum[0]       += mask ? ijField[2][0] : 0.0f;
+                fieldPolarSum[1]       += mask ? ijField[2][1] : 0.0f;
+                fieldPolarSum[2]       += mask ? ijField[2][2] : 0.0f;
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom || (y+j) == targetAtom ){
+            unsigned int index                 = atomI == targetAtom ? (y+j) : atomI;
+            unsigned int pullBackIndex         = 0;
+            unsigned int indexI                = 0;
+            unsigned int indexJ                = indexI ? 0 : 2;
+            debugArray[index].x                = (float) atomI;
+            debugArray[index].y                = (float) (y + j);
+            debugArray[index].z                = cAmoebaSim.cutoffDistance2;
+            debugArray[index].w                = 6.0f;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;
+            pullBackIndex++;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            float flag                         = 6.0f;
+            debugArray[index].x                = ijField[indexI][0];
+            debugArray[index].y                = ijField[indexI][1];
+            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].w                = flag;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ][0];
+            debugArray[index].y                = ijField[indexJ][1];
+            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].w                = flag;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI+1][0];
+            debugArray[index].y                = ijField[indexI+1][1];
+            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].w                = flag;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ+1][0];
+            debugArray[index].y                = ijField[indexJ+1][1];
+            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].w                = flag;
+/*
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
+            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
+            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int mask                  = 1 << j;
+            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
+            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
+            debugArray[index].x                = (float) pScaleIndex;
+            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
+            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
+            debugArray[index].w                = + 10.0f;
+*/
+}
+#endif
+            }
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset            = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
+            load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
+#else
+            unsigned int offset            = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            load3dArray( offset, fieldSum,      outputField );
+            load3dArray( offset, fieldPolarSum, outputFieldPolar);
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                unsigned int atomJ        = y + tgx;
+                // load coordinates, charge, ...
+                loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
+            }
+           // zero shared fields
+            zeroMutualInducedParticleSharedField(  &(sA[threadIdx.x]) );
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                float ijField[4][3];
+                // load coords, charge, ...
+                calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField
+#ifdef AMOEBA_DEBUG
+, pullBack 
+#endif
+   );
+                unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                // add to field at atomI the field due atomJ's dipole
+                fieldSum[0]              += mask ? ijField[0][0] : 0.0f;
+                fieldSum[1]              += mask ? ijField[0][1] : 0.0f;
+                fieldSum[2]              += mask ? ijField[0][2] : 0.0f;
+                // add to polar field at atomI the field due atomJ's dipole
+                fieldPolarSum[0]         += mask ? ijField[2][0] : 0.0f;
+                fieldPolarSum[1]         += mask ? ijField[2][1] : 0.0f;
+                fieldPolarSum[2]         += mask ? ijField[2][2] : 0.0f;
+                // add to field at atomJ the field due atomI's dipole
+                psA[tj].field[0]         += mask ? ijField[1][0] : 0.0f;
+                psA[tj].field[1]         += mask ? ijField[1][1] : 0.0f;
+                psA[tj].field[2]         += mask ? ijField[1][2] : 0.0f;
+                // add to polar field at atomJ the field due atomI's dipole
+                psA[tj].fieldPolar[0]    += mask ? ijField[3][0] : 0.0f;
+                psA[tj].fieldPolar[1]    += mask ? ijField[3][1] : 0.0f;
+                psA[tj].fieldPolar[2]    += mask ? ijField[3][2] : 0.0f;
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom || (y+tj) == targetAtom ){
+            unsigned int index                 = atomI == targetAtom ? (y+tj) : atomI;
+            unsigned int pullBackIndex         = 0;
+            unsigned int indexI                = 0;
+            unsigned int indexJ                = indexI ? 0 : 2;
+            debugArray[index].x                = (float) atomI;
+            debugArray[index].y                = (float) (y + tj);
+            debugArray[index].z                = cAmoebaSim.cutoffDistance2;
+            debugArray[index].w                = 7.0f;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;
+            pullBackIndex++;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            float flag                         = 7.0f;
+            debugArray[index].x                = ijField[indexI][0];
+            debugArray[index].y                = ijField[indexI][1];
+            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].w                = flag;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ][0];
+            debugArray[index].y                = ijField[indexJ][1];
+            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].w                = flag;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI+1][0];
+            debugArray[index].y                = ijField[indexI+1][1];
+            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].w                = flag;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ+1][0];
+            debugArray[index].y                = ijField[indexJ+1][1];
+            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].w                = flag;
+/*
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
+            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
+            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int mask                  = 1 << j;
+            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
+            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
+            debugArray[index].x                = (float) pScaleIndex;
+            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
+            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
+            debugArray[index].w                = + 10.0f;
+*/
+}
+#endif
+                tj                  = (tj + 1) & (GRID - 1);
+            }
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset     = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
+            load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
+            offset                  = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field,      outputField );
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
+#else
+            unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            load3dArray( offset, fieldSum,      outputField );
+            load3dArray( offset, fieldPolarSum, outputFieldPolar);
+            offset                  = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            load3dArray( offset, sA[threadIdx.x].field,      outputField );
+            load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
+#endif
+            lasty = y;
+        }
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -373,6 +373,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
            cudaComputeAmoebaMutualInducedField( amoebaGpu );
        } else {
            cudaComputeAmoebaPmeFixedEField( amoebaGpu );
+            cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
        }
    }