PME real space and self terms for fixed E-field

cf335495 · Mark Friedrichs · acb46362 · cf335495 · cf335495 · cf335495
Commit cf335495 authored Sep 03, 2010 by Mark Friedrichs
11 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -1420,6 +1420,18 @@ void gpuKirkwoodAllocate( amoebaGpuContext amoebaGpu )
    
 }

+static void tabulateErfc(gpuContext gpu) 
+{
+    int tableSize = 2048;
+    gpu->sim.tabulatedErfcSize = tableSize;
+    gpu->sim.tabulatedErfcScale = tableSize/(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff);
+    gpu->psTabulatedErfc = new CUDAStream<float>(tableSize, 1, "TabulatedErfc");
+    gpu->sim.pTabulatedErfc = gpu->psTabulatedErfc->_pDevData;
+    for (int i = 0; i < tableSize; ++i) 
+        (*gpu->psTabulatedErfc)[i] = (float) erfc(i*(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff)/tableSize);
+    gpu->psTabulatedErfc->Upload();
+}
+
 /**---------------------------------------------------------------------------------------

   Create/initialize data structs associated w/ molecular -> lab frame calculation
@@ -1525,14 +1537,26 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    } else if( nonbondedMethod == 1 ){
        amoebaGpu->multipoleNonbondedMethod = AMOEBA_PARTICLE_MESH_EWALD;
    } else {
-        throw OpenMM::OpenMMException("multipoleNonbondedMethod not recognzied.\n" );
+        throw OpenMM::OpenMMException("multipoleNonbondedMethod not recognized.\n" );
    }
-    amoebaGpu->amoebaSim.cutoffDistance2    = cutoffDistance*cutoffDistance;
-    amoebaGpu->amoebaSim.sqrtPi             = sqrt( 3.1415926535897932384626433832795 );
-    amoebaGpu->amoebaSim.aewald             = aewald;
-    amoebaGpu->amoebaSim.electric           = electricConstant;
+
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log,"%s Nonbonded method=%d %d [NoCutoff=%d PME=%d]\n",
+                        methodName.c_str(), nonbondedMethod, amoebaGpu->multipoleNonbondedMethod,
+                        AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
+        (void) fflush( amoebaGpu->log );
+    }
+    amoebaGpu->amoebaSim.cutoffDistance2             = cutoffDistance*cutoffDistance;
+    amoebaGpu->amoebaSim.sqrtPi                      = sqrt( 3.1415926535897932384626433832795 );
+    amoebaGpu->amoebaSim.aewald                      = aewald;
+    amoebaGpu->amoebaSim.electric                    = electricConstant;
+
+    amoebaGpu->gpuContext->sim.alphaEwald            = aewald;
+    amoebaGpu->gpuContext->sim.nonbondedCutoff       = cutoffDistance;
+    tabulateErfc(amoebaGpu->gpuContext); 
+
    if( amoebaGpu->amoebaSim.dielec < 1.0e-05 ){
-        amoebaGpu->amoebaSim.dielec      = 1.0f;
+        amoebaGpu->amoebaSim.dielec               = 1.0f;
    }

    for( int ii = 0; ii < static_cast<int>(charges.size()); ii++ ){
@@ -2593,6 +2617,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
    SetCalculateAmoebaCudaVdw14_7Sim( amoebaGpu );
    SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
    SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );
+    SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu );
    SetCalculateAmoebaElectrostaticSim( amoebaGpu );
    SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );
    SetCalculateAmoebaCudaMapTorquesSim( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -65,12 +65,17 @@ extern void SetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext gpu);
 extern void kCalculateAmoebaWcaDispersionForces(amoebaGpuContext amoebaGpu );

-// fixed electric field
+// fixed electric field -- no cutoff

 extern void SetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext gpu);
 extern void cudaComputeAmoebaFixedEField( amoebaGpuContext gpu);

+// fixed electric field  -- PME
+extern void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext gpu);
+extern void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext gpu);
+extern void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext gpu);
+
 // fixed electric field and Gk

 extern void SetCalculateAmoebaCudaFixedEAndGKFieldsSim(amoebaGpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
@@ -83,6 +83,9 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(

        FixedFieldParticle* psA    = &sA[tbx];
        unsigned int atomI         = x + tgx;
+        FixedFieldParticle localParticle;
+        loadFixedFieldShared( &localParticle, atomI, bornRadii ); 
+
        float4 iCoord              = atomCoord[atomI];

        float eFieldSum[3];
@@ -106,9 +109,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(

            // load coordinates, charge, ...

-            loadFixedFieldShared( &(sA[threadIdx.x]), atomI,
-                                  atomCoord, labFrameDipole, labFrameQuadrupole,
-                                  cAmoebaSim.pDampingFactorAndThole, bornRadii );
+            loadFixedFieldShared( &(sA[threadIdx.x]), atomI, bornRadii );

            if (!bExclusionFlag)
            {
@@ -125,12 +126,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(

                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );

-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[j].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[j].thole,
-                                                        &(labFrameDipole[atomI*3]),                      jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,                ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif
@@ -182,12 +178,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(

                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );

-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[j].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[j].thole,
-                                                        &(labFrameDipole[atomI*3]),                      jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,                ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif
@@ -366,9 +357,7 @@ if( atomI == targetAtom ){
            {
                // load coordinates, charge, ...

-                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx),
-                                      atomCoord, labFrameDipole, labFrameQuadrupole,
-                                      cAmoebaSim.pDampingFactorAndThole, bornRadii );
+                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx),  bornRadii );

            }

@@ -387,12 +376,7 @@ if( atomI == targetAtom ){
    
                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole, &jBornRadius );

-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[tj].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[tj].thole,
-                                                        &(labFrameDipole[atomI*3]),                      jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,                ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
 #ifdef AMOEBA_DEBUG
                                                 , pullBack
 #endif
@@ -563,12 +547,7 @@ if( (atomI == targetAtom  || (y + tj) == targetAtom) ){

                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole, &jBornRadius );
 
-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[tj].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[tj].thole,
-                                                        &(labFrameDipole[atomI*3]),                      jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,                ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -61,46 +61,6 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
 #define METHOD_NAME(a, b) a##N2ByWarp##b
 #include "kCalculateAmoebaCudaFixedEField.h"

-#ifdef AMOEBA_DEBUG
-#if 0
-static void printEFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
-{
-    (void) fprintf( amoebaGpu->log, "EField Buffer %u\n", bufferIndex );
-    unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
-    unsigned int stop  = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
-    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        unsigned int ii3Index      = ii/3;
-        unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
-        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
-    } 
-}
-
-static void printEFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
-{
-    (void) fprintf( amoebaGpu->log, "EField atom %u\n", targetAtom );
-    for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-        unsigned int particleIndex = targetAtom + ii*3*amoebaGpu->paddedNumberOfAtoms;
-        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
-    } 
-}
-#endif
-#endif
-
 /**---------------------------------------------------------------------------------------

   Compute fixed electric field
@@ -145,9 +105,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
        (void) fprintf( amoebaGpu->log, "N2 warp\n" );
        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -167,9 +124,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )

        kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
@@ -36,9 +36,6 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
                            unsigned int* workUnit,
-                            float4* atomCoord,
-                            float* labFrameDipole,
-                            float* labFrameQuadrupole,
                            float* outputEField,
                            float* outputEFieldPolar
 #ifdef AMOEBA_DEBUG
@@ -59,10 +56,6 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;

-    float4 jCoord;
-    float jDipole[3];    
-    float jQuadrupole[9];    
- 
    while (pos < end)
    {

@@ -80,7 +73,9 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(

        FixedFieldParticle* psA    = &sA[tbx];
        unsigned int atomI         = x + tgx;
-        float4 iCoord              = atomCoord[atomI];
+
+        FixedFieldParticle localParticle;
+        loadFixedFieldShared( &localParticle, atomI );

        float fieldSum[3];
        float fieldPolarSum[3];
@@ -98,9 +93,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(

            // load coordinates, charge, ...

-            loadFixedFieldShared( &(sA[threadIdx.x]), atomI,
-                                  atomCoord, labFrameDipole, labFrameQuadrupole,
-                                  cAmoebaSim.pDampingFactorAndThole );
+            loadFixedFieldShared( &(sA[threadIdx.x]), atomI );

            if (!bExclusionFlag)
            {
@@ -113,16 +106,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(

                    float ijField[2][3];

-                    // load coords, charge, ...
-
-                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );
-
-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[j].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[j].thole,
-                                                        &(labFrameDipole[atomI*3]),                      jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,                ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif
@@ -156,14 +140,9 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(

                    float ijField[2][3];

-                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );
+                    //loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );

-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                     jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[j].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[j].thole,
-                                                        &(labFrameDipole[atomI*3]),                 jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),             jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,           ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif
@@ -280,9 +259,7 @@ if( 0 && atomI == targetAtom ){

                // load coordinates, charge, ...

-                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx),
-                                      atomCoord, labFrameDipole, labFrameQuadrupole,
-                                      cAmoebaSim.pDampingFactorAndThole );
+                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );

            }

@@ -297,16 +274,7 @@ if( 0 && atomI == targetAtom ){

                    float ijField[2][3];
    
-                    // load coords, charge, ...
-    
-                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole );
-
-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                     jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[tj].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[tj].thole,
-                                                        &(labFrameDipole[atomI*3]),                 jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),             jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,           ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
 #ifdef AMOEBA_DEBUG
                                                 , pullBack
 #endif
@@ -420,14 +388,7 @@ if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){

                    float ijField[2][3];

-                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole );
- 
-                    calculateFixedEFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].x,      psA[tj].damp,
-                                                        cAmoebaSim.pDampingFactorAndThole[atomI].y,      psA[tj].thole,
-                                                        &(labFrameDipole[atomI*3]),                      jDipole,
-                                                        &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                        cAmoebaSim.scalingDistanceCutoff,                ijField
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
@@ -46,9 +46,7 @@ struct FixedFieldParticle {
 #endif
 };

-__device__ void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI,
-                                      float4* atomCoord, float* labDipole, float* labQuadrupole,
-                                      float2* dampingFactorAndThole
+__device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI 
 #ifdef GK
    , float* bornR
 #endif
@@ -56,28 +54,28 @@ __device__ void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned in
 {
    // coordinates & charge

-    sA->x                        = atomCoord[atomI].x;
-    sA->y                        = atomCoord[atomI].y;
-    sA->z                        = atomCoord[atomI].z;
-    sA->q                        = atomCoord[atomI].w;
+    sA->x                        = cSim.pPosq[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
+    sA->q                        = cSim.pPosq[atomI].w;

    // lab dipole

-    sA->labFrameDipole_X         = labDipole[atomI*3];
-    sA->labFrameDipole_Y         = labDipole[atomI*3+1];
-    sA->labFrameDipole_Z         = labDipole[atomI*3+2];
+    sA->labFrameDipole_X         = cAmoebaSim.pLabFrameDipole[atomI*3];
+    sA->labFrameDipole_Y         = cAmoebaSim.pLabFrameDipole[atomI*3+1];
+    sA->labFrameDipole_Z         = cAmoebaSim.pLabFrameDipole[atomI*3+2];

    // lab quadrupole

-    sA->labFrameQuadrupole_XX    = labQuadrupole[atomI*9];
-    sA->labFrameQuadrupole_XY    = labQuadrupole[atomI*9+1];
-    sA->labFrameQuadrupole_XZ    = labQuadrupole[atomI*9+2];
-    sA->labFrameQuadrupole_YY    = labQuadrupole[atomI*9+4];
-    sA->labFrameQuadrupole_YZ    = labQuadrupole[atomI*9+5];
-    sA->labFrameQuadrupole_ZZ    = labQuadrupole[atomI*9+8];
+    sA->labFrameQuadrupole_XX    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
+    sA->labFrameQuadrupole_XY    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole_XZ    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole_YY    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole_YZ    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole_ZZ    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];

-    sA->damp                     = dampingFactorAndThole[atomI].x;
-    sA->thole                    = dampingFactorAndThole[atomI].y;
+    sA->damp                     = cAmoebaSim.pDampingFactorAndThole[atomI].x;
+    sA->thole                    = cAmoebaSim.pDampingFactorAndThole[atomI].y;
 #ifdef GK
    sA->bornR                    = bornR[atomI];
 #endif
@@ -86,8 +84,8 @@ __device__ void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned in

 // load struct and arrays w/ shared data in sA

-__device__ void loadFixedFieldParticleData( struct FixedFieldParticle* sA, 
-                                            float4* jCoord, float* jDipole, float* jQuadrupole
+__device__ static void loadFixedFieldParticleData( struct FixedFieldParticle* sA, 
+                                                   float4* jCoord, float* jDipole, float* jQuadrupole
 #ifdef GK
 , float* bornR
 #endif
@@ -142,15 +140,11 @@ __device__ static void zeroFixedFieldParticleSharedField( struct FixedFieldParti
 #endif
 }

+
 // body of fixed E-field calculation

-__device__ static void calculateFixedEFieldPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ,
-                                                    float dampingFactorI,    float dampingFactorJ,
-                                                    float tholeI,            float tholeJ,
-                                                    float* labDipoleI, float* labDipoleJ,
-                                                    float* labQuadrupoleI, float* labQuadrupoleJ,
-                                                    float scalingDistanceCutoff,
-                                                    float field[2][3]
+__device__ static void calculateFixedEFieldPairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
+                                                           float field[2][3]
 #ifdef AMOEBA_DEBUG
                                                    , float4 debugArray[12]
 #endif
@@ -163,9 +157,9 @@ __device__ static void calculateFixedEFieldPairIxn_kernel( float4 atomCoordinate
    // get deltaR and r between 2 atoms
 
    float deltaR[3];
-    deltaR[0]           = atomCoordinatesJ.x - atomCoordinatesI.x;
-    deltaR[1]           = atomCoordinatesJ.y - atomCoordinatesI.y;
-    deltaR[2]           = atomCoordinatesJ.z - atomCoordinatesI.z;
+    deltaR[0]           = atomJ.x - atomI.x;
+    deltaR[1]           = atomJ.y - atomI.y;
+    deltaR[2]           = atomJ.z - atomI.z;

    float r             =  SQRT( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
    float rI            =  1.0f/r;
@@ -177,14 +171,14 @@ __device__ static void calculateFixedEFieldPairIxn_kernel( float4 atomCoordinate
 
    // get scaling factors, if needed
    
-    float damp          = dampingFactorI*dampingFactorJ;
+    float damp          = atomI.damp*atomJ.damp;
    float dampExp;
-    if( damp != 0.0f && r < scalingDistanceCutoff ){
+    if( damp != 0.0f && r < cAmoebaSim.scalingDistanceCutoff ){

        // get scaling factors
      
        float ratio     = r/damp;
-        float pGamma    = tholeJ > tholeI ? tholeI : tholeJ; 
+        float pGamma    = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole; 
        damp            = ratio*ratio*ratio*pGamma;
        dampExp         = EXP( -damp );
    } else {
@@ -197,69 +191,30 @@ __device__ static void calculateFixedEFieldPairIxn_kernel( float4 atomCoordinate
      
    float rr5_2         = rr5*2.0f;
 
-#ifdef AMOEBA_DEBUG
-    int index           = 0;
-
-    // 0-2
-    debugArray[index].x   = r;
-    debugArray[index].y   = rr3;
-    debugArray[index].z   = rr5;
-    index++;
-#endif
-
-    float* dipole       = labDipoleJ;
-    float* quadrupole   = labQuadrupoleJ;
    float  qDotDelta[3];
-    qDotDelta[0]        = deltaR[0]*quadrupole[0] + deltaR[1]*quadrupole[1] + deltaR[2]*quadrupole[2];
-    qDotDelta[1]        = deltaR[0]*quadrupole[3] + deltaR[1]*quadrupole[4] + deltaR[2]*quadrupole[5];
-    qDotDelta[2]        = deltaR[0]*quadrupole[6] + deltaR[1]*quadrupole[7] + deltaR[2]*quadrupole[8];
- 
-    float dotdd         = deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2];
-    float dotqd         = deltaR[0]*qDotDelta[0] + deltaR[1]*qDotDelta[1] + deltaR[2]*qDotDelta[2];
-    float factor        = -rr3*atomCoordinatesJ.w + rr5*dotdd - rr7*dotqd;
+    qDotDelta[0]        = deltaR[0]*atomJ.labFrameQuadrupole_XX + deltaR[1]*atomJ.labFrameQuadrupole_XY + deltaR[2]*atomJ.labFrameQuadrupole_XZ;
+    qDotDelta[1]        = deltaR[0]*atomJ.labFrameQuadrupole_XY + deltaR[1]*atomJ.labFrameQuadrupole_YY + deltaR[2]*atomJ.labFrameQuadrupole_YZ;
+    qDotDelta[2]        = deltaR[0]*atomJ.labFrameQuadrupole_XZ + deltaR[1]*atomJ.labFrameQuadrupole_YZ + deltaR[2]*atomJ.labFrameQuadrupole_ZZ;
 
-#ifdef AMOEBA_DEBUG
-    // 3-5
-    debugArray[index].x   = dotdd;
-    debugArray[index].y   = dotqd;
-    debugArray[index].z   = factor;
-    index++;
-#endif
+    float dotdd         = deltaR[0]*atomJ.labFrameDipole_X      + deltaR[1]*atomJ.labFrameDipole_Y      + deltaR[2]*atomJ.labFrameDipole_Z;
+    float dotqd         = deltaR[0]*qDotDelta[0]                + deltaR[1]*qDotDelta[1]                + deltaR[2]*qDotDelta[2];

-    field[0][0]         = deltaR[0]*factor - rr3*dipole[0] + rr5_2*qDotDelta[0];
-    field[0][1]         = deltaR[1]*factor - rr3*dipole[1] + rr5_2*qDotDelta[1];
-    field[0][2]         = deltaR[2]*factor - rr3*dipole[2] + rr5_2*qDotDelta[2];
+    float factor        = -rr3*atomJ.q + rr5*dotdd - rr7*dotqd;
+ 
+    field[0][0]         = deltaR[0]*factor - rr3*atomJ.labFrameDipole_X + rr5_2*qDotDelta[0];
+    field[0][1]         = deltaR[1]*factor - rr3*atomJ.labFrameDipole_Y + rr5_2*qDotDelta[1];
+    field[0][2]         = deltaR[2]*factor - rr3*atomJ.labFrameDipole_Z + rr5_2*qDotDelta[2];
 
-    dipole              = labDipoleI;
-    quadrupole          = labQuadrupoleI;
-    qDotDelta[0]        = deltaR[0]*quadrupole[0] + deltaR[1]*quadrupole[1] + deltaR[2]*quadrupole[2];
-    qDotDelta[1]        = deltaR[0]*quadrupole[3] + deltaR[1]*quadrupole[4] + deltaR[2]*quadrupole[5];
-    qDotDelta[2]        = deltaR[0]*quadrupole[6] + deltaR[1]*quadrupole[7] + deltaR[2]*quadrupole[8];
+    qDotDelta[0]        = deltaR[0]*atomI.labFrameQuadrupole_XX + deltaR[1]*atomI.labFrameQuadrupole_XY + deltaR[2]*atomI.labFrameQuadrupole_XZ;
+    qDotDelta[1]        = deltaR[0]*atomI.labFrameQuadrupole_XY + deltaR[1]*atomI.labFrameQuadrupole_YY + deltaR[2]*atomI.labFrameQuadrupole_YZ;
+    qDotDelta[2]        = deltaR[0]*atomI.labFrameQuadrupole_XZ + deltaR[1]*atomI.labFrameQuadrupole_YZ + deltaR[2]*atomI.labFrameQuadrupole_ZZ;
 
-    dotdd               = deltaR[0]*dipole[0]    + deltaR[1]*dipole[1]    + deltaR[2]*dipole[2];
+    dotdd               = deltaR[0]*atomI.labFrameDipole_X    + deltaR[1]*atomI.labFrameDipole_Y    + deltaR[2]*atomI.labFrameDipole_Z;
    dotqd               = deltaR[0]*qDotDelta[0] + deltaR[1]*qDotDelta[1] + deltaR[2]*qDotDelta[2];
-    factor              = rr3*atomCoordinatesI.w + rr5*dotdd + rr7*dotqd;
+    factor              = rr3*atomI.q + rr5*dotdd + rr7*dotqd;
 
-#ifdef AMOEBA_DEBUG
-    // 6-8
-    debugArray[index].x = dotdd;
-    debugArray[index].y = dotqd;
-    debugArray[index].z = factor;
-    index++;
-#endif
+    field[1][0]         = deltaR[0]*factor - rr3*atomI.labFrameDipole_X - rr5_2*qDotDelta[0];
+    field[1][1]         = deltaR[1]*factor - rr3*atomI.labFrameDipole_Y - rr5_2*qDotDelta[1];
+    field[1][2]         = deltaR[2]*factor - rr3*atomI.labFrameDipole_Z - rr5_2*qDotDelta[2];

-    field[1][0]         = deltaR[0]*factor - rr3*dipole[0] - rr5_2*qDotDelta[0];
-    field[1][1]         = deltaR[1]*factor - rr3*dipole[1] - rr5_2*qDotDelta[1];
-    field[1][2]         = deltaR[2]*factor - rr3*dipole[2] - rr5_2*qDotDelta[2];
-
-#if 0
-    float testValue  = 1.0f;
-    field[0][0]      = testValue;
-    field[0][1]      = testValue;
-    field[0][2]      = testValue;
-    field[1][0]      = testValue;
-    field[1][1]      = testValue;
-    field[1][2]      = testValue;
-#endif
- 
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+
+//-----------------------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------------------
+
+#include "amoebaCudaKernels.h"
+#include "kCalculateAmoebaCudaUtilities.h"
+
+//#define AMOEBA_DEBUG
+
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+
+void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
+    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
+    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+
+void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
+    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         
+    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+}
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReduceDirectSelfFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers,  float* fieldIn, float* fieldOut )
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Reduce field
+
+    const float term = (4.0f/3.0f)*(cAmoebaSim.aewald*cAmoebaSim.aewald*cAmoebaSim.aewald)/cAmoebaSim.sqrtPi;
+    while (pos < fieldComponents)
+    {   
+
+        // self-term included here
+
+        float totalField = term*cAmoebaSim.pLabFrameDipole[pos];
+
+        float* pFt       = fieldIn + pos;
+        unsigned int i   = outputBuffers;
+        while (i >= 4)
+        {   
+            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
+            pFt        += fieldComponents*4;
+            i          -= 4;
+        }   
+
+        if (i >= 2)
+        {   
+            totalField += pFt[0] + pFt[fieldComponents];
+            pFt        += fieldComponents*2;
+            i          -= 2;
+        }   
+
+        if (i > 0)
+        {   
+            totalField += pFt[0];
+        }   
+        fieldOut[pos]   = totalField;
+        pos            += gridDim.x * blockDim.x;
+    }   
+}
+
+// reduce psWorkArray_3_1 -> EField
+// reduce psWorkArray_3_2 -> EFieldPolar
+
+static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
+{
+    kReduceDirectSelfFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
+                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psE_Field->_pDevStream[0] );
+    LAUNCHERROR("kReducePmeE_Fields1");
+
+    kReduceDirectSelfFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
+                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0] );
+    LAUNCHERROR("kReducePmeE_Fields2");
+}
+
+// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
+
+#undef GK
+#include "kCalculateAmoebaCudaFixedFieldParticle.h"
+__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
+                                                            float dscale, float pscale, float fields[4][3]
+#ifdef AMOEBA_DEBUG
+                                                            , float4* pullBack
+#endif
+
+ ){
+
+    // compute the real space portion of the Ewald summation
+  
+    float xr          = atomJ.x - atomI.x;
+    float yr          = atomJ.y - atomI.y;
+    float zr          = atomJ.z - atomI.z;
+
+    // periodic boundary conditions
+
+    xr               -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+
+    float r2          = xr*xr + yr* yr + zr*zr;
+    float r           = sqrtf(r2);
+
+    // calculate the error function damping terms
+
+    float ralpha      = cAmoebaSim.aewald*r;
+    float bn[4];
+
+    bn[0]             = erfc(ralpha)/r;
+    float alsq2       = 2.0f*cAmoebaSim.aewald*cAmoebaSim.aewald;
+    float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cAmoebaSim.aewald);
+    float exp2a       = exp(-(ralpha*ralpha));
+    alsq2n           *= alsq2;
+    bn[1]             = (bn[0]+alsq2n*exp2a)/r2;
+
+    alsq2n           *= alsq2;
+    bn[2]             = (3.0f*bn[1]+alsq2n*exp2a)/r2;
+
+    alsq2n           *= alsq2;
+    bn[3]             = (5.0f*bn[2]+alsq2n*exp2a)/r2;
+
+    // compute the error function scaled and unscaled terms
+
+    float scale3      = 1.0f;
+    float scale5      = 1.0f;
+    float scale7      = 1.0f;
+    float damp        = atomI.damp*atomJ.damp;
+    if( damp != 0.0f ){
+
+        float ratio  = (r/damp);
+              ratio  = ratio*ratio*ratio;
+
+        float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
+
+              damp   = -pgamma*ratio;
+
+        if( damp > -50.0f) {
+            float expdamp = exp(damp);
+            scale3        = 1.0f - expdamp;
+            scale5        = 1.0f - expdamp*(1.0f-damp);
+            scale7        = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
+        }
+    }
+    float dsc3        = dscale*scale3;
+    float dsc5        = dscale*scale5;
+    float dsc7        = dscale*scale7;
+
+    float psc3        = pscale*scale3;
+    float psc5        = pscale*scale5;
+    float psc7        = pscale*scale7;
+
+    float r3          = (r*r2);
+    float r5          = (r3*r2);
+    float r7          = (r5*r2);
+    float drr3        = (1.0f-dsc3)/r3;
+    float drr5        = 3.0f * (1.0f-dsc5)/r5;
+    float drr7        = 15.0f * (1.0f-dsc7)/r7;
+
+    float prr3        = (1.0f-psc3) / r3;
+    float prr5        = 3.0f *(1.0f-psc5)/r5;
+    float prr7        = 15.0f*(1.0f-psc7)/r7;
+
+    float dir         = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr;
+
+    float qix         = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
+    float qiy         = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
+    float qiz         = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
+
+    float qir         = qix*xr + qiy*yr + qiz*zr;
+
+    float dkr         = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr;
+    float qkx         = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
+    float qky         = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
+    float qkz         = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
+    float qkr         = qkx*xr + qky*yr + qkz*zr;
+
+    float fim[3],fkm[3];
+    float fid[3],fkd[3];
+    float fip[3],fkp[3];
+    fim[0]            = -xr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr)
+                         - bn[1]*atomJ.labFrameDipole_X + 2.0f*bn[2]*qkx;
+
+    fim[1]            = -yr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr)
+                         - bn[1]*atomJ.labFrameDipole_Y + 2.0f*bn[2]*qky;
+
+    fim[2]            = -zr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr)
+                         - bn[1]*atomJ.labFrameDipole_Z + 2.0f*bn[2]*qkz;
+
+    fkm[0]            = xr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir)
+                         - bn[1]*atomI.labFrameDipole_X - 2.0f*bn[2]*qix;
+
+    fkm[1]            = yr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir)
+                         - bn[1]*atomI.labFrameDipole_Y - 2.0f*bn[2]*qiy;
+
+    fkm[2]            = zr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir)
+                         - bn[1]*atomI.labFrameDipole_Z - 2.0f*bn[2]*qiz;
+
+    fid[0]            = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
+                         - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
+
+    fid[1]            = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
+                         - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
+
+    fid[2]            = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
+                         - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
+
+    fkd[0]            = xr*(drr3*atomI.q+drr5*dir+drr7*qir)
+                         - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
+
+    fkd[1]            = yr*(drr3*atomI.q+drr5*dir+drr7*qir)
+                         - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
+
+    fkd[2]            = zr*(drr3*atomI.q+drr5*dir+drr7*qir)
+                         - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
+
+    fip[0]            = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
+                         - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
+
+    fip[1]            = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
+                         - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
+
+    fip[2]            = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
+                         - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
+
+    fkp[0]            = xr*(prr3*atomI.q+prr5*dir+prr7*qir)
+                         - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
+
+    fkp[1]            = yr*(prr3*atomI.q+prr5*dir+prr7*qir)
+                         - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
+
+    fkp[2]            = zr*(prr3*atomI.q+prr5*dir+prr7*qir)
+                         - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
+  
+    // increment the field at each site due to this interaction
+
+    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+
+        fields[0][0]       = fim[0] - fid[0];
+        fields[1][0]       = fkm[0] - fkd[0];
+        fields[2][0]       = fim[0] - fip[0];
+        fields[3][0]       = fkm[0] - fkp[0];
+    
+        fields[0][1]       = fim[1] - fid[1];
+        fields[1][1]       = fkm[1] - fkd[1];
+        fields[2][1]       = fim[1] - fip[1];
+        fields[3][1]       = fkm[1] - fkp[1];
+    
+        fields[0][2]       = fim[2] - fid[2];
+        fields[1][2]       = fkm[2] - fkd[2];
+        fields[2][2]       = fim[2] - fip[2];
+        fields[3][2]       = fkm[2] - fkp[2];
+ 
+    } else {
+
+        fields[0][0]       = 0.0f;
+        fields[1][0]       = 0.0f;
+        fields[2][0]       = 0.0f;
+        fields[3][0]       = 0.0f;
+    
+        fields[0][1]       = 0.0f;
+        fields[1][1]       = 0.0f;
+        fields[2][1]       = 0.0f;
+        fields[3][1]       = 0.0f;
+    
+        fields[0][2]       = 0.0f;
+        fields[1][2]       = 0.0f;
+        fields[2][2]       = 0.0f;
+        fields[3][2]       = 0.0f;
+    }
+#ifdef AMOEBA_DEBUG
+    pullBack[0].x = xr;
+    pullBack[0].y = yr;
+    pullBack[0].z = zr;
+    pullBack[0].w = r2;
+
+    pullBack[1].x = atomJ.x - atomI.x;
+    pullBack[1].y = atomJ.y - atomI.y;
+    pullBack[1].z = atomJ.z - atomI.z;
+    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
+/*
+    pullBack[1].x = scale3;
+    pullBack[1].y = scale5;
+    pullBack[1].z = scale7;
+*/
+#endif
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaPmeFixedEField.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaPmeFixedEField.h"
+
+/**---------------------------------------------------------------------------------------
+
+   Compute fixed electric field using PME
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
+{
+  
+   // ---------------------------------------------------------------------------------------
+   // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu    = amoebaGpu->gpuContext;
+
+#ifdef AMOEBA_DEBUG
+    static const char* methodName = "computeCudaAmoebaPmeFixedEField";
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
+    }
+    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+
+    // N2 debug array
+
+    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    debugArray->Upload();
+
+    (*gpu->psInteractionCount)[0]              = gpu->sim.workUnits;
+    gpu->psInteractionCount->Upload();
+
+    // print intermediate results for the targetAtom 
+
+    unsigned int targetAtom  = 0;
+#endif
+
+    kClearFields_3( amoebaGpu, 2 );
+
+    if (gpu->bOutputBufferPerWarp){
+        (void) fprintf( amoebaGpu->log, "N2 warp\n" );
+        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                           debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+#endif
+    } else {
+
+#ifdef AMOEBA_DEBUG
+        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
+        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+#endif
+
+        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                           debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+#endif
+    }
+
+    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
+
+#if 0
+        for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
+            //float index = 1.0f;
+            float index = (float) ii;
+            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
+                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
+                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk]   = index;
+                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
+                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
+            }
+        }
+        amoebaGpu->psWorkArray_3_1->Upload();
+#endif
+
+    kReducePmeDirectE_Fields( amoebaGpu );
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        gpu->psInteractionCount->Download();
+        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+        amoebaGpu->psWorkArray_3_1->Download();
+        amoebaGpu->psWorkArray_3_2->Download();
+        amoebaGpu->psE_Field->Download();
+        amoebaGpu->psE_FieldPolar->Download();
+        (void) fprintf( amoebaGpu->log, "OutEFields\n" );
+        int maxPrint        = 32;
+        for( int ii = 0; ii < gpu->natoms; ii++ ){
+           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
+
+            int indexOffset     = ii*3;
+
+           // E_Field
+
+           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
+                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
+                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
+                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
+   
+           // E_Field polar
+
+           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
+                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset],
+                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+1],
+                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+2] );
+
+           (void) fprintf( amoebaGpu->log,"\n" );
+           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
+                ii = gpu->natoms - maxPrint;
+           }
+        }
+        (void) fflush( amoebaGpu->log );
+        (void) fprintf( amoebaGpu->log, "EFields End\n" );
+
+        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
+        debugArray->Download();
+
+        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        for( int jj = 0; jj < gpu->natoms; jj++ ){
+            int debugIndex = jj;
+            (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
+            for( int kk = 0; kk < 7; kk++ ){
+                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
+                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
+                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                debugIndex += paddedNumberOfAtoms;
+            }
+            (void) fprintf( amoebaGpu->log,"\n" );
+
+        }
+
+        // write results to file
+
+        if( 1 ){
+            std::vector<int> fileId;
+            //fileId.push_back( 0 );
+            VectorOfDoubleVectors outputVector;
+            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector);
+            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
+
+         }
+         delete debugArray;
+    }
+#endif
+
+        if( 1 ){
+            std::vector<int> fileId;
+            fileId.push_back( 0 );
+            VectorOfDoubleVectors outputVector;
+            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector);
+            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
+
+         }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaScaleFactors.h"
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
+                            unsigned int* workUnit,
+                            float* outputEField,
+                            float* outputEFieldPolar
+#ifdef AMOEBA_DEBUG
+                           , float4* debugArray, unsigned int targetAtom
+#endif
+){
+
+#ifdef AMOEBA_DEBUG
+    float4 pullBack[12];
+    float dScaleVal;
+    float pScaleVal;
+#endif
+
+    extern __shared__ FixedFieldParticle sA[];
+
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+    unsigned int lasty           = 0xFFFFFFFF;
+
+    while (pos < end)
+    {
+
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+
+        // extract cell coordinates
+
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+
+        unsigned int tgx           = threadIdx.x & (GRID - 1);
+        unsigned int tbx           = threadIdx.x - tgx;
+        unsigned int tj            = tgx;
+
+        FixedFieldParticle* psA    = &sA[tbx];
+        unsigned int atomI         = x + tgx;
+        FixedFieldParticle localParticle;
+        loadFixedFieldShared( &localParticle, atomI );
+
+        float fieldSum[3];
+        float fieldPolarSum[3];
+
+        fieldSum[0]                = 0.0f;
+        fieldSum[1]                = 0.0f;
+        fieldSum[2]                = 0.0f;
+
+        fieldPolarSum[0]           = 0.0f;
+        fieldPolarSum[1]           = 0.0f;
+        fieldPolarSum[2]           = 0.0f;
+
+        if (x == y)
+        {
+
+            // load coordinates, charge, ...
+
+            loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
+
+            if (!bExclusionFlag)
+            {
+
+                // this branch is never exercised since it includes the
+                // interaction between atomI and itself which is always excluded
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+
+                    float ijField[4][3];
+
+                    // load coords, charge, ...
+
+#ifdef AMOEBA_DEBUG
+dScaleVal = 1.0f;
+pScaleVal = 1.0f;
+#endif
+                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], 1.0f, 1.0f, ijField
+#ifdef AMOEBA_DEBUG
+                                                , pullBack
+#endif
+                    );
+
+                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
+
+                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+
+                    fieldSum[0]            += match ? 0.0f : ijField[0][0];
+                    fieldSum[1]            += match ? 0.0f : ijField[0][1];
+                    fieldSum[2]            += match ? 0.0f : ijField[0][2];
+
+                    fieldPolarSum[0]       += match ? 0.0f : ijField[2][0];
+                    fieldPolarSum[1]       += match ? 0.0f : ijField[2][1];
+                    fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];
+                }
+
+            }
+            else  // bExclusion
+            {
+                unsigned int xi       = x >> GRIDBITS;
+                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+
+                    // load coords, charge, ...
+
+                    float ijField[4][3];
+                    float dScaleValue;
+                    float pScaleValue;
+
+                    getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
+                    getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
+
+#ifdef AMOEBA_DEBUG
+dScaleVal = dScaleValue;
+pScaleVal = pScaleValue;
+#endif
+                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
+#ifdef AMOEBA_DEBUG
+                                                , pullBack
+#endif
+                    );
+
+
+                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
+                    // by setting match flag
+
+                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
+
+                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+
+                    fieldSum[0]            += match ? 0.0f : ijField[0][0];
+                    fieldSum[1]            += match ? 0.0f : ijField[0][1];
+                    fieldSum[2]            += match ? 0.0f : ijField[0][2];
+
+                    fieldPolarSum[0]       += match ? 0.0f : ijField[2][0];
+                    fieldPolarSum[1]       += match ? 0.0f : ijField[2][1];
+                    fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];
+
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom ){
+            unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
+            unsigned int pullBackIndex         = 0;
+            unsigned int indexI                = 0;
+            unsigned int indexJ                = indexI ? 0 : 2;
+
+            debugArray[index].x                = (float) atomI;
+            debugArray[index].y                = (float) (y + j);
+            debugArray[index].z                = dScaleValue;
+            debugArray[index].w                = pScaleValue;
+
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;
+
+            pullBackIndex++;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            float flag                         = 7.0f;
+            debugArray[index].x                = ijField[indexI][0];
+            debugArray[index].y                = ijField[indexI][1];
+            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ][0];
+            debugArray[index].y                = ijField[indexJ][1];
+            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI+1][0];
+            debugArray[index].y                = ijField[indexI+1][1];
+            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ+1][0];
+            debugArray[index].y                = ijField[indexJ+1][1];
+            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].w                = flag;
+
+/*
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
+            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
+            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int mask                  = 1 << j;
+            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
+            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
+            debugArray[index].x                = (float) pScaleIndex;
+
+            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
+            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
+            debugArray[index].w                = + 10.0f;
+*/
+
+}
+#endif
+                }
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
+            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
+#else
+            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            load3dArray( offset, fieldSum,       outputEField );
+            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
+#endif
+
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+
+                // load coordinates, charge, ...
+
+                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
+
+            }
+
+            // zero shared fields
+
+            zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
+
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+
+                    float ijField[4][3];
+    
+                    // load coords, charge, ...
+    
+#ifdef AMOEBA_DEBUG
+dScaleVal = 1.0f;
+pScaleVal = 1.0f;
+#endif
+                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], 1.0f, 1.0f, ijField
+#ifdef AMOEBA_DEBUG
+                                                 , pullBack
+#endif
+                    );
+    
+                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+
+                    fieldSum[0]        += ijField[0][0];
+                    fieldSum[1]        += ijField[0][1];
+                    fieldSum[2]        += ijField[0][2];
+        
+                    fieldPolarSum[0]   += ijField[2][0];
+                    fieldPolarSum[1]   += ijField[2][1];
+                    fieldPolarSum[2]   += ijField[2][2];
+
+                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
+
+                    psA[tj].eField[0]  += ijField[1][0];
+                    psA[tj].eField[1]  += ijField[1][1];
+                    psA[tj].eField[2]  += ijField[1][2];
+
+                    psA[tj].eFieldP[0] += ijField[3][0];
+                    psA[tj].eFieldP[1] += ijField[3][1];
+                    psA[tj].eFieldP[2] += ijField[3][2];
+
+
+#ifdef AMOEBA_DEBUG
+if( (atomI == targetAtom  || (y + tj) == targetAtom) ){
+            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
+            unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
+            unsigned int indexJ                = (atomI == targetAtom) ? 2 : 0;
+
+            debugArray[index].x                = (float) atomI;
+            debugArray[index].y                = (float) (y + tj);
+            debugArray[index].z                = dScaleVal;
+            debugArray[index].w                = pScaleVal;
+
+            unsigned int pullBackIndex         = 0;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;;
+
+            pullBackIndex++;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;;
+
+
+            float flag                         = 8.0f;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI][0];
+            debugArray[index].y                = ijField[indexI][1];
+            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ][0];
+            debugArray[index].y                = ijField[indexJ][1];
+            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI+1][0];
+            debugArray[index].y                = ijField[indexI+1][1];
+            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ+1][0];
+            debugArray[index].y                = ijField[indexJ+1][1];
+            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].w                = flag;
+
+#if 0
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int mask                  = 1 << j;
+            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
+            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
+            debugArray[index].x                = (float) pScaleIndex;
+
+            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
+            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
+            debugArray[index].w                = pScaleValue + 10.0f;
+#endif
+}
+#endif
+
+                    tj                  = (tj + 1) & (GRID - 1);
+
+                }
+            }
+            else  // bExclusion
+            {
+                // Read fixed atom data into registers and GRF
+
+                unsigned int xi   = x >> GRIDBITS;
+                unsigned int yi   = y >> GRIDBITS;
+                unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    // load coords, charge, ...
+
+                    float ijField[4][3];
+
+                    float dScaleValue;
+                    float pScaleValue;
+                    getMaskedDScaleFactor( tj, dScaleMask, &dScaleValue );
+                    getMaskedPScaleFactor( tj, pScaleMask, &pScaleValue );
+
+#ifdef AMOEBA_DEBUG
+dScaleVal = dScaleValue;
+pScaleVal = pScaleValue;
+#endif
+                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], dScaleValue, pScaleValue, ijField
+#ifdef AMOEBA_DEBUG
+                                                , pullBack
+#endif
+                    );
+
+                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+
+                    fieldSum[0]        += ijField[0][0];
+                    fieldSum[1]        += ijField[0][1];
+                    fieldSum[2]        += ijField[0][2];
+
+                    fieldPolarSum[0]   += ijField[2][0];
+                    fieldPolarSum[1]   += ijField[2][1];
+                    fieldPolarSum[2]   += ijField[2][2];
+
+                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
+
+                    psA[tj].eField[0]  += ijField[1][0];
+                    psA[tj].eField[1]  += ijField[1][1];
+                    psA[tj].eField[2]  += ijField[1][2];
+
+                    psA[tj].eFieldP[0] += ijField[3][0];
+                    psA[tj].eFieldP[1] += ijField[3][1];
+                    psA[tj].eFieldP[2] += ijField[3][2];
+
+
+#ifdef AMOEBA_DEBUG
+if( (atomI == targetAtom || (y + tj) == targetAtom) ){
+
+            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
+            unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
+            unsigned int indexJ                = (atomI == targetAtom) ? 2 : 0;
+
+            debugArray[index].x                = (float) atomI;
+            debugArray[index].y                = (float) (y + tj);
+            debugArray[index].z                = dScaleVal;
+            debugArray[index].w                = pScaleVal;
+
+            unsigned int pullBackIndex         = 0;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;;
+
+            pullBackIndex++;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = pullBack[pullBackIndex].x;
+            debugArray[index].y                = pullBack[pullBackIndex].y;
+            debugArray[index].z                = pullBack[pullBackIndex].z;
+            debugArray[index].w                = pullBack[pullBackIndex].w;;
+
+
+            float flag                         = 9.0f;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI][0];
+            debugArray[index].y                = ijField[indexI][1];
+            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ][0];
+            debugArray[index].y                = ijField[indexJ][1];
+            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexI+1][0];
+            debugArray[index].y                = ijField[indexI+1][1];
+            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].w                = flag;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = ijField[indexJ+1][0];
+            debugArray[index].y                = ijField[indexJ+1][1];
+            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].w                = flag;
+
+
+}
+#endif
+
+
+
+                    tj                  = (tj + 1) & (GRID - 1);
+                }
+            }
+
+            // Write results
+
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
+            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
+
+            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
+
+#else
+            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            load3dArray( offset, fieldSum,       outputEField );
+            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
+
+            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
+            load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
+ 
+#endif
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
@@ -1027,6 +1027,9 @@ void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
      (void) fflush( amoebaGpu->log );
 #endif

+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
+
      kCalculateAmoebaCudaRealSpaceEwaldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                         gpu->psPosq4->_pDevStream[0],

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -368,8 +368,12 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
        cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
        cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
    } else {
-        cudaComputeAmoebaFixedEField( amoebaGpu );
-        cudaComputeAmoebaMutualInducedField( amoebaGpu );
+        if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
+            cudaComputeAmoebaFixedEField( amoebaGpu );
+            cudaComputeAmoebaMutualInducedField( amoebaGpu );
+        } else {
+            cudaComputeAmoebaPmeFixedEField( amoebaGpu );
+        }
    }

    // check if induce dipole calculation converged -- abort if it did not

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -1989,6 +1989,7 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
    int usePme                        = 0;
    double aewald                     = 0.0;
    double cutoffDistance             = 0.0;
+    double box[3]                     = { 10.0, 10.0, 10.0 };

    // usePme, aewald, cutoffDistance added w/ Version 1

@@ -1996,7 +1997,11 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        usePme                        = atoi( tokens[2].c_str() );
        aewald                        = atof( tokens[3].c_str() );
        cutoffDistance                = atof( tokens[4].c_str() );
+        box[0]                        = atof( tokens[5].c_str() );
+        box[1]                        = atof( tokens[6].c_str() );
+        box[2]                        = atof( tokens[7].c_str() );
    }
+  
    if( usePme ){
        multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::PME );
    } else {
@@ -2004,6 +2009,7 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
    }
    multipoleForce->setAEwald( aewald );
    multipoleForce->setCutoffDistance( cutoffDistance );
+    system.setDefaultPeriodicBoxVectors( Vec3(box[0], 0.0, 0.0), Vec3(0.0, box[1], 0.0), Vec3(0.0, 0.0, box[2]) );
    if( log ){
        (void) fprintf( log, "%s number of MultipoleParameter terms=%d usePme=%d aewald=%15.7e cutoffDistance=%12.4f\n",
                        methodName.c_str(), numberOfMultipoles, usePme, aewald, cutoffDistance );
@@ -2106,10 +2112,26 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        double polarityConversion      = AngstromToNm*AngstromToNm*AngstromToNm;
        double dampingFactorConversion = sqrt( AngstromToNm );
      
-        float scalingDistanceCutoff    = static_cast<float>(multipoleForce->getScalingDistanceCutoff());
-              scalingDistanceCutoff   *= static_cast<float>(AngstromToNm);
-        multipoleForce->setScalingDistanceCutoff( scalingDistanceCutoff );
+        multipoleForce->setAEwald(                multipoleForce->getAEwald()/AngstromToNm);
+        multipoleForce->setCutoffDistance(        multipoleForce->getCutoffDistance()*AngstromToNm );
+        multipoleForce->setScalingDistanceCutoff( multipoleForce->getScalingDistanceCutoff()*AngstromToNm );
+
+        Vec3 a,b,c;
+        system.getDefaultPeriodicBoxVectors( a, b, c);
+
+        a[0] *= AngstromToNm;
+        a[1] *= AngstromToNm;
+        a[2] *= AngstromToNm;
+
+        b[0] *= AngstromToNm;
+        b[1] *= AngstromToNm;
+        b[2] *= AngstromToNm;

+        c[0] *= AngstromToNm;
+        c[1] *= AngstromToNm;
+        c[2] *= AngstromToNm;
+
+        system.setDefaultPeriodicBoxVectors( a, b, c);

        for( int ii = 0; ii < multipoleForce->getNumMultipoles();  ii++ ){

@@ -2144,6 +2166,15 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringI
        (void) fprintf( log, "%s Sample of parameters using %s units.\n", methodName.c_str(),
                        (useOpenMMUnits ? "OpenMM" : "Amoeba") );

+        std::string nonbondedMethod = multipoleForce->getNonbondedMethod( ) == AmoebaMultipoleForce::PME ? "PME" : "NoCutoff";
+        (void) fprintf( log, "NonbondedMethod=%s aEwald=%15.7e cutoff=%15.7e.\n", nonbondedMethod.c_str(),
+                         multipoleForce->getAEwald(), multipoleForce->getCutoffDistance() );
+
+        Vec3 a,b,c;
+        system.getDefaultPeriodicBoxVectors( a, b, c );
+        (void) fprintf( log, "Box=[%12.3f %12.3f %12.3f] [%12.3f %12.3f %12.3f] [%12.3f %12.3f %12.3f]\n",
+                        a[0], a[1], a[2], b[0], b[1], b[2], c[0], c[1], c[2] );
+
        (void) fprintf( log, "Supplementary fields %u: ",  static_cast<unsigned int>(supplementary.size())  );
        for( MapStringVectorOfVectorsCI ii = supplementary.begin(); ii != supplementary.end();  ii++ ){
            (void) fprintf( log, "%s ", (*ii).first.c_str() );