Mods for direct PME

a9054686 · Mark Friedrichs · 01260070 · a9054686 · a9054686 · a9054686
Commit a9054686 authored Oct 06, 2010 by Mark Friedrichs
18 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
    log                           = NULL;
    contextImpl                   = NULL;
    gpuInitialized                = false;
+    applyCutoff                   = 0;
+    multipoleForceCount           = 0;
 }   
 AmoebaCudaData::~AmoebaCudaData() {
@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
    return;
 }
+void AmoebaCudaData::incrementMultipoleForceCount( void ) {
+    multipoleForceCount++;
+}
+int AmoebaCudaData::getMultipoleForceCount( void ) const {
+    return multipoleForceCount;
+}
+void AmoebaCudaData::setApplyCutoff( int inputApplyCutoff ) {
+    applyCutoff = inputApplyCutoff;
+}
+int AmoebaCudaData::getApplyCutoff( void ) const {
+    return applyCutoff;
+}
 }
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -139,11 +139,41 @@ public:
     */
    void setContextImpl( void* contextImpl ); 
+    /**
+     * Get multipole force count 
+     * 
+     * @return multipole force count
+     */
+    int getMultipoleForceCount( void ) const; 
+    /**
+     * Get multipole force count 
+     * 
+     * @return multipole force count
+     */
+    void incrementMultipoleForceCount( void ); 
+    /**
+     * Get multipole force count 
+     * 
+     * @return multipole force count
+     */
+    int getApplyCutoff( ) const; 
+    /**
+     * Get multipole force count 
+     * 
+     * @return multipole force count
+     */
+    void setApplyCutoff( int applyCutoff ); 
 private:
    CudaPlatform::PlatformData& cudaPlatformData;
    amoebaGpuContext amoebaGpu;
    bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
+    int multipoleForceCount;
+    int applyCutoff;
    KernelImpl* localForceKernel;
    unsigned int kernelCount;
    void* contextImpl;

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
 static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    amoebaGpuContext gpu = data.getAmoebaGpu();
+    if( data.getMultipoleForceCount() == 0 ){
+        gpuCopyInteractingWorkUnit( gpu );
+    }
+    if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
+        gpuReorderAtoms(gpu->gpuContext);
+    }
+    data.incrementMultipoleForceCount();
    data.initializeGpu();
    if( 0 && data.getLog() ){
@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            zsize = pmeGridDimension[2];
        }
        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
+        data.setApplyCutoff( 1 );
+        amoebaGpuContext amoebaGpu  = data.getAmoebaGpu();
+        gpuContext gpu              = amoebaGpu->gpuContext;
+        gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
+        gpu->sim.nonbondedMethod    = PARTICLE_MESH_EWALD;
    }
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     sqrtPi                             %15.7e\n",  amoebaGpu->amoebaSim.sqrtPi );
    (void) fprintf( log, "     alpha Ewald                        %15.7e\n",  gpu->sim.alphaEwald );
    (void) fprintf( log, "     PME grid dimensions                %6d %6d %6d\n",  gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z);
-    (void) fprintf( log, "     cutoffDistance2                    %15.7e\n",  amoebaGpu->amoebaSim.cutoffDistance2 );
+    (void) fprintf( log, "     nonbondedCutoffSqr                 %15.7e\n",  gpu->sim.nonbondedCutoffSqr);
    (void) fprintf( log, "     electric                           %15.7e\n",  amoebaGpu->amoebaSim.electric );
    (void) fprintf( log, "     box                                %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
    (void) fprintf( log, "     gkc                                %15.7e\n",  amoebaGpu->amoebaSim.gkc );
@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                        AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
        (void) fflush( amoebaGpu->log );
    }
-    amoebaGpu->amoebaSim.cutoffDistance2             = cutoffDistance*cutoffDistance;
    amoebaGpu->amoebaSim.sqrtPi                      = std::sqrt( 3.14159265358f );
    amoebaGpu->amoebaSim.electric                    = electricConstant;
    amoebaGpu->gpuContext->sim.alphaEwald            = alphaEwald;
@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
    }
 }
+/**---------------------------------------------------------------------------------------
+   Track iterations for MI dipoles
+   @param amoebaGpu            amoebaGpuContext reference
+   @param iteration            MI iteration
+   --------------------------------------------------------------------------------------- */
+void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
+// ---------------------------------------------------------------------------------------
+    gpuContext gpu = amoebaGpu->gpuContext;
+    gpu->psInteractingWorkUnit->Download();
+    gpu->psWorkUnit->Download();
+    amoebaGpu->psWorkUnit->Download();
+    (void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
+    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
+        gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+        gpu->psWorkUnit->_pSysStream[0][ii]            = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+    }    
+    gpu->psInteractingWorkUnit->Upload();
+    gpu->psWorkUnit->Upload();
+// ---------------------------------------------------------------------------------------
+}
 #undef  AMOEBA_DEBUG
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
    unsigned int numberOfAtoms;                     // number of atoms
    unsigned int paddedNumberOfAtoms;               // padded number of atoms
-    float cutoffDistance2;                          // cutoff distance squared for PME
+    //float cutoffDistance2;                          // cutoff distance squared for PME
    float sqrtPi;                                   // sqrt(PI)
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
 extern "C"
 void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
+extern "C"
+void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
 /*
 extern "C"
 void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
@@ -44,6 +44,11 @@ struct FixedFieldParticle {
    float gkField[3];
 #endif
+#ifdef INCLUDE_FIXED_FIELD_BUFFERS
+    float tempBuffer[3];
+    float tempBufferP[3];
+#endif
 };
 __device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
@@ -24,6 +24,11 @@ struct MutualInducedParticle {
    float fieldS[3];
    float fieldPolarS[3];
 #endif
+#ifdef INCLUDE_MI_FIELD_BUFFERS
+    float tempBuffer[3];
+    float tempBufferP[3];
+#endif
 };
 __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
        multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
        multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
        float* phi = &cAmoebaSim.pPhi[20*i];
-        cAmoebaSim.pTorque[3*i] = -cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
+        cAmoebaSim.pTorque[3*i] = cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
                      + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9]
                      + multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5]
                      - multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]);
-        cAmoebaSim.pTorque[3*i+1] = -cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
+        cAmoebaSim.pTorque[3*i+1] = cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
                      + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8]
                      + multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6]
                      - multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]);
-        cAmoebaSim.pTorque[3*i+2] = -cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
+        cAmoebaSim.pTorque[3*i+2] = cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
                      + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7]
                      + multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8]
                      - multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]);
@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
        f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
        f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
        float4 force = cSim.pForce4[i];
-        force.x += f.x;
+        force.x -= f.x;
-        force.y += f.y;
+        force.y -= f.y;
-        force.z += f.z;
+        force.z -= f.z;
        cSim.pForce4[i] = force;
@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
        multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
        multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
        float* phidp = &cAmoebaSim.pPhidp[20*i];
-        cAmoebaSim.pTorque[3*i] = -0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+        cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
                      + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
                      + multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
                      - multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
-        cAmoebaSim.pTorque[3*i+1] = -0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+        cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
                      + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
                      + multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
                      - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
-        cAmoebaSim.pTorque[3*i+2] = -0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+        cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
                      + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
                      + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
                      - multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
        f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
        f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
        float4 force = cSim.pForce4[i];
-        force.x += f.x;
+        force.x -= f.x;
-        force.y += f.y;
+        force.y -= f.y;
-        force.z += f.z;
+        force.z -= f.z;
        cSim.pForce4[i] = force;
    }
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
    float torque[3];
    float padding;
+    float tempForce[3];
+    float tempTorque[3];
 };
+__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){
+    atomI.tempForce[0]  += atomJ.tempForce[0];
+    atomI.tempForce[1]  += atomJ.tempForce[1];
+    atomI.tempForce[2]  += atomJ.tempForce[2];
+    atomI.tempTorque[0] += atomJ.tempTorque[0];
+    atomI.tempTorque[1] += atomJ.tempTorque[1];
+    atomI.tempTorque[2] += atomJ.tempTorque[2];
+}
 /*
 __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
    float uiy        = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
    float uiz        = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
-    atomI.torque[0] -= term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
+    atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
-    atomI.torque[1] -= term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
+    atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
-    atomI.torque[2] -= term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
+    atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
 }
 __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI,   PmeDirectElectrostaticParticle& atomJ,
@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
    float gfr[8],gfri[7];
    float gti[7],gtri[7];
-    float conversionFactor   = (cAmoebaSim.electric/cAmoebaSim.dielec);
+    float conversionFactor   = (-cAmoebaSim.electric/cAmoebaSim.dielec);
    // set the permanent multipole and induced dipole values;
@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
    zr         -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
    float r2    = xr*xr + yr*yr + zr*zr;
-    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        float r      = sqrt(r2);
        float ck     = atomJ.q;
@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
        e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
        ei = ei - erli;
-        *energy = conversionFactor*(e + ei);
+        *energy = -conversionFactor*(e + ei);
        // increment the total intramolecular energy; assumes;
        // intramolecular distances are less than half of cell;
@@ -1154,22 +1166,34 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
    // on first pass, set threads/block
    if( threadsPerBlock == 0 ){
-      unsigned int maxThreads;
+        unsigned int maxThreads;
-      if (gpu->sm_version >= SM_20)
+        if (gpu->sm_version >= SM_20)
-          maxThreads = 384;
+            maxThreads = 384;
-      else if (gpu->sm_version >= SM_12)
+        else if (gpu->sm_version >= SM_12)
-          maxThreads = 128;
+            maxThreads = 128;
-      else
+        else
-          maxThreads = 64;
+            maxThreads = 64;
-      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), maxThreads);
    }
    kClearFields_3( amoebaGpu, 2 );
+#ifdef AMOEBA_DEBUG
+    (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n", 
+                    threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
+                    (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
+      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
+                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                      sizeof(PmeDirectElectrostaticParticle)+sizeof(float3), (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits,
+                      gpu->sim.nonbond_threads_per_block );
+      (void) fflush( amoebaGpu->log );
+#endif
    if (gpu->bOutputBufferPerWarp){
-      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
-                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                         gpu->sim.pInteractingWorkUnit,
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
    } else {
-#ifdef AMOEBA_DEBUG
-      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                      sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-      (void) fflush( amoebaGpu->log );
-#endif
-      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
+//                                                                         gpu->sim.pInteractingWorkUnit,
-                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+//                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
+                                                                         gpu->sim.pInteractingWorkUnit,
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
        (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
-        int maxPrint        = 1400;
+        int maxPrint        = 5;
        float conversion    = 1.0f/41.84f;
        float forceSum[3]   = { 0.0f, 0.0f, 0.0f};
        for( int ii = 0; ii < gpu->natoms; ii++ ){
@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
        }
        (void) fprintf( amoebaGpu->log,"\n" );
-        if( 1 ){
+        if( 0 ){
            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
            for( int jj = 0; jj < gpu->natoms; jj++ ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -65,6 +65,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
        unsigned int x;
        unsigned int y;
        bool bExclusionFlag;
+        int  dScaleMask;
+        int2 pScaleMask;
+        int2 mScaleMask;
        // Extract cell coordinates
@@ -99,90 +102,60 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
            loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );
-            if (!bExclusionFlag)
+            if (bExclusionFlag)
-            {
-                // this branch is never exercised since it includes the
-                // interaction between atomI and itself which is always excluded
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float force[3];
-                    float torque[2][3];
-                    float energy;
-                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
-                                                                   scalingFactors, force, torque, &energy
-#ifdef AMOEBA_DEBUG
-, pullBack
-#endif
- );
-                    unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    localParticle.force[0]            += mask ? force[0]     : 0.0f;
-                    localParticle.force[1]            += mask ? force[1]     : 0.0f;
-                    localParticle.force[2]            += mask ? force[2]     : 0.0f;
-                    localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
-                    localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
-                    localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
-                    totalEnergy                       += mask ? 0.5*energy   : 0.0f;
-                }
-            }
-            else  // bExclusion
            {
                unsigned int xi       = x >> GRIDBITS;
                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 mScaleMask       = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                mScaleMask            = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+            } else {
+                scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
+            }
-                for (unsigned int j = 0; j < GRID; j++)
+            for (unsigned int j = 0; j < GRID; j++)
-                {
+            {
-                    float force[3];
+                float force[3];
-                    float torque[2][3];
+                float torque[2][3];
-                    unsigned int atomJ = y + j;
+                unsigned int atomJ = y + j;
-                    // set scale factors
+                // set scale factors
+                if (bExclusionFlag)
+                {
                    getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
                    getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
                    getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
+                }
-                    // force
+                // force
-                    float energy;
+                float energy;
-                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[j],
+                calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[j],
-                                                                   scalingFactors, force, torque, &energy
+                                                               scalingFactors, force, torque, &energy
 #ifdef AMOEBA_DEBUG
 , pullBack
 #endif
 );
-                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
+                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
-                    // by setting match flag
+                // by setting match flag
-                    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+                // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    localParticle.force[0]            += mask ? force[0]     : 0.0f;
+                localParticle.force[0]            += mask ? force[0]     : 0.0f;
-                    localParticle.force[1]            += mask ? force[1]     : 0.0f;
+                localParticle.force[1]            += mask ? force[1]     : 0.0f;
-                    localParticle.force[2]            += mask ? force[2]     : 0.0f;
+                localParticle.force[2]            += mask ? force[2]     : 0.0f;
-                    localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
+                localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
-                    localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
+                localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
-                    localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
+                localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
-                    totalEnergy                       += mask ? 0.5*energy   : 0.0f;
+                totalEnergy                       += mask ? 0.5*energy   : 0.0f;
 #ifdef AMOEBA_DEBUG
 /*
@@ -229,8 +202,7 @@ if( atomI == targetAtom ){
 }
 #endif
-                }
+            } // end of j-loop
-            }
            // include self energy and self torque
@@ -281,165 +253,79 @@ if( atomI == targetAtom ){
            outputTorque[offset+2]              = localParticle.torque[2];
 #endif
-        }
+        } else {
-        else
-        {
-            if (lasty != y)
-            {
-                // load shared data
-               loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
+            unsigned int flags           = cSim.pInteractionFlag[pos];
+            if (flags == 0) {
-            }
+                // No interactions in this block.
+            } else {
-            sA[threadIdx.x].force[0]     = 0.0f;
+                if (lasty != y) {
-            sA[threadIdx.x].force[1]     = 0.0f;
-            sA[threadIdx.x].force[2]     = 0.0f;
+                    // load shared data
-            sA[threadIdx.x].torque[0]    = 0.0f;
+                   loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
-            sA[threadIdx.x].torque[1]    = 0.0f;
-            sA[threadIdx.x].torque[2]    = 0.0f;
+                }
-            if (!bExclusionFlag)
+                sA[threadIdx.x].force[0]     = 0.0f;
-            {
+                sA[threadIdx.x].force[1]     = 0.0f;
+                sA[threadIdx.x].force[2]     = 0.0f;
+                sA[threadIdx.x].torque[0]    = 0.0f;
+                sA[threadIdx.x].torque[1]    = 0.0f;
+                sA[threadIdx.x].torque[2]    = 0.0f;
+                if( bExclusionFlag )
+                {
+                    unsigned int xi   = x >> GRIDBITS;
+                    unsigned int yi   = y >> GRIDBITS;
+                    unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                    mScaleMask        = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                } else {
+                    scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
+                }
                for (unsigned int j = 0; j < GRID; j++)
                {
+                    unsigned int jIdx  = (flags == 0xFFFFFFFF) ? tj : j;
+                    unsigned int atomJ = y + jIdx;
                    float force[3];
                    float torque[2][3];
-                    unsigned int atomJ = y + tj;
-                    float energy;
-                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,  psA[tj],
-                                                                   scalingFactors, force, torque, &energy
-#ifdef AMOEBA_DEBUG
-, pullBack
-#endif
- );
-                           unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
-                           // add force and torque to atom I due atom J
-                           localParticle.force[0]         += mask ? force[0]      : 0.0f;
-                           localParticle.force[1]         += mask ? force[1]      : 0.0f;
-                           localParticle.force[2]         += mask ? force[2]      : 0.0f;
-                           localParticle.torque[0]        += mask ? torque[0][0]  : 0.0f;
-                           localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
-                           localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
-                           totalEnergy                    += mask ? energy        : 0.0f;
-                           // add force and torque to atom J due atom I
+                    // set scale factors
-                           psA[tj].force[0]               -= mask ?  force[0]     : 0.0f;
+                    if( bExclusionFlag )
-                           psA[tj].force[1]               -= mask ?  force[1]     : 0.0f;
+                    {
-                           psA[tj].force[2]               -= mask ?  force[2]     : 0.0f;
+                        getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
+                        getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
-                           psA[tj].torque[0]              += mask ?  torque[1][0] : 0.0f;
+                        getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
-                           psA[tj].torque[1]              += mask ?  torque[1][1] : 0.0f;
+                    }
-                           psA[tj].torque[2]              += mask ?  torque[1][2] : 0.0f;
+                    // force
-#ifdef AMOEBA_DEBUG
-/*
-energy =  mask ? 0.5*energy   : 0.0f;
-if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
-    debugSetup( atomI, atomJ, debugArray, pullBack );
-} */
-if( atomI == targetAtom  || atomJ == targetAtom ){
-    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
-    unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-    unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-    float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
-    float blockId                      = 2.0f;
-    debugArray[index].x                = (float) atomI;
-    debugArray[index].y                = (float) atomJ;
-    debugArray[index].z                = (float) y;
-    debugArray[index].w                = blockId;
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
-    debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
-    debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
-    debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
-    debugArray[index].w                = blockId;
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
-    debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
-    debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
-    debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
-    debugArray[index].w                = mask ? energy       : 0.0f;
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
-    debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
-    debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
-    debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
-    debugArray[index].w                = (float) (blockIdx.x * blockDim.x + threadIdx.x);
-    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-    }
-}
-#endif
-                           tj                  = (tj + 1) & (GRID - 1);
-                       }
-                   }
-                   else  // bExclusion
-                   {
-                       // Read fixed atom data into registers and GRF
-                       unsigned int xi   = x >> GRIDBITS;
-                       unsigned int yi   = y >> GRIDBITS;
-                       unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                       int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                       int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                       int2 mScaleMask   = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                       for (unsigned int j = 0; j < GRID; j++)
-                       {
-                           float force[3];
-                           float torque[2][3];
-                           unsigned int atomJ = y + tj;
-                           // set scale factors
-                           getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
-                           getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
-                           getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
-                           // force
                    float energy;
-                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[tj],
+                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[jIdx],
                                                                   scalingFactors, force, torque, &energy
 #ifdef AMOEBA_DEBUG
-, pullBack
+    , pullBack
 #endif
- );
+     );
                    // check if atoms out-of-bounds
                    unsigned int mask    =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
                    // add force and torque to atom I due atom J
                    localParticle.force[0]         += mask ? force[0]      : 0.0f;
                    localParticle.force[1]         += mask ? force[1]      : 0.0f;
                    localParticle.force[2]         += mask ? force[2]      : 0.0f;
                    localParticle.torque[0]        += mask ? torque[0][0]  : 0.0f;
                    localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
                    localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
@@ -448,182 +334,138 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                    // add force and torque to atom J due atom I
-                    psA[tj].force[0]               -= mask ?  force[0]     : 0.0f;
+                    if( flags == 0xFFFFFFFF ){
-                    psA[tj].force[1]               -= mask ?  force[1]     : 0.0f;
-                    psA[tj].force[2]               -= mask ?  force[2]     : 0.0f;
-                    psA[tj].torque[0]              += mask ?  torque[1][0] : 0.0f;
-                    psA[tj].torque[1]              += mask ?  torque[1][1] : 0.0f;
-                    psA[tj].torque[2]              += mask ?  torque[1][2] : 0.0f;
-#ifdef AMOEBA_DEBUG
+                        psA[jIdx].force[0]               -= mask ?  force[0]     : 0.0f;
-/*
+                        psA[jIdx].force[1]               -= mask ?  force[1]     : 0.0f;
-energy =  mask ? 0.5*energy   : 0.0f;
+                        psA[jIdx].force[2]               -= mask ?  force[2]     : 0.0f;
-if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
-    debugSetup( atomI, atomJ, debugArray, pullBack );
+                        psA[jIdx].torque[0]              += mask ?  torque[1][0] : 0.0f;
-} */
+                        psA[jIdx].torque[1]              += mask ?  torque[1][1] : 0.0f;
-if( atomI == targetAtom  || atomJ == targetAtom ){
+                        psA[jIdx].torque[2]              += mask ?  torque[1][2] : 0.0f;
-        unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
-        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
+                    } else {
-        unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-        float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+                        psA[threadIdx.x].tempForce[0]     = mask ? 0.0f : force[0];
-        float blockId                      = 3.0f;
+                        psA[threadIdx.x].tempForce[1]     = mask ? 0.0f : force[1];
+                        psA[threadIdx.x].tempForce[2]     = mask ? 0.0f : force[2];
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) atomJ;
+                        psA[threadIdx.x].tempTorque[0]    = mask ? 0.0f : torque[1][0];
-        debugArray[index].z                = (float) y;
+                        psA[threadIdx.x].tempTorque[1]    = mask ? 0.0f : torque[1][1];
-        debugArray[index].w                = blockId;
+                        psA[threadIdx.x].tempTorque[2]    = mask ? 0.0f : torque[1][2];
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+                        if( tgx % 2 == 0 ){
-        debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );  
-        debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
+                        }
-        debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
+                        if( tgx % 4 == 0 ){
-        debugArray[index].w                = blockId;
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );  
+                        }
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+                        if( tgx % 8 == 0 ){
-        debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );  
-        debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
+                        }
-        debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
+                        if( tgx % 16 == 0 ){
-        debugArray[index].w                = energy;
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );  
+                        }
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
+                        if (tgx == 0)
-        debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
+                        {
-        debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
+                            psA[jIdx].force[0]  -= psA[threadIdx.x].tempForce[0]  + psA[threadIdx.x+16].tempForce[0];
-        debugArray[index].w                = (float) (blockIdx.x * blockDim.x + threadIdx.x);
+                            psA[jIdx].force[1]  -= psA[threadIdx.x].tempForce[1]  + psA[threadIdx.x+16].tempForce[1];
+                            psA[jIdx].force[2]  -= psA[threadIdx.x].tempForce[2]  + psA[threadIdx.x+16].tempForce[2];
-        for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+                            psA[jIdx].torque[0] += psA[threadIdx.x].tempTorque[0] + psA[threadIdx.x+16].tempTorque[0];
-            debugArray[index].x                = pullBack[pullIndex].x;
+                            psA[jIdx].torque[1] += psA[threadIdx.x].tempTorque[1] + psA[threadIdx.x+16].tempTorque[1];
-            debugArray[index].y                = pullBack[pullIndex].y;
+                            psA[jIdx].torque[2] += psA[threadIdx.x].tempTorque[2] + psA[threadIdx.x+16].tempTorque[2];
-            debugArray[index].z                = pullBack[pullIndex].z;
+                        }
-            debugArray[index].w                = pullBack[pullIndex].w;
+                    }
-        }
+                    tj                              = (tj + 1) & (GRID - 1);
-#if 0
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = scalingFactors[DScaleIndex];
-            debugArray[index].y                = dScaleVal;
-            debugArray[index].z                = scalingFactors[PScaleIndex];
-            debugArray[index].w                = pScaleVal;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = scalingFactors[MScaleIndex];
-            debugArray[index].y                = mScaleVal;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = labFrameDipole[3*atomI];
-            debugArray[index].y                = labFrameDipole[3*atomI+1];
-            debugArray[index].z                = labFrameDipole[3*atomI+2];
-            debugArray[index].w                = 25.0f;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = labFrameDipole[3*atomJ];
-            debugArray[index].y                = labFrameDipole[3*atomJ+1];
-            debugArray[index].z                = labFrameDipole[3*atomJ+2];
-            debugArray[index].w                = 26.0f;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jDipole[0];
-            debugArray[index].y                = jDipole[1];
-            debugArray[index].z                = jDipole[2];
-            debugArray[index].w                = 27.0f;
-#endif
-}
-#endif
-                    tj                  = (tj + 1) & (GRID - 1);
-                }
-            }
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            float of;
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            of                                  = outputForce[offset];
-            of                                 += localParticle.force[0];
-            outputForce[offset]                 = of;
-            of                                  = outputForce[offset+1];
-            of                                 += localParticle.force[1];
-            outputForce[offset+1]               = of;
-            of                                  = outputForce[offset+2];
-            of                                 += localParticle.force[2];
-            outputForce[offset+2]               = of;
-            of                                  = outputTorque[offset];
-            of                                 += localParticle.torque[0];
-            outputTorque[offset]                 = of;
-            of                                  = outputTorque[offset+1];
-            of                                 += localParticle.torque[1];
-            outputTorque[offset+1]               = of;
-            of                                  = outputTorque[offset+2];
-            of                                 += localParticle.torque[2];
-            outputTorque[offset+2]               = of;
-            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            of                                  = outputForce[offset];
-            of                                 += sA[threadIdx.x].force[0];
-            outputForce[offset]                 = of;
-            of                                  = outputForce[offset+1];
-            of                                 += sA[threadIdx.x].force[1];
-            outputForce[offset+1]               = of;
-            of                                  = outputForce[offset+2];
-            of                                 += sA[threadIdx.x].force[2];
-            outputForce[offset+2]               = of;
-            of                                  = outputTorque[offset];
-            of                                 += sA[threadIdx.x].torque[0];
-            outputTorque[offset]                = of;
-            of                                  = outputTorque[offset+1];
-            of                                 += sA[threadIdx.x].torque[1];
-            outputTorque[offset+1]              = of;
-            of                                  = outputTorque[offset+2];
-            of                                 += sA[threadIdx.x].torque[2];
-            outputTorque[offset+2]              = of;
-#else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            outputForce[offset]                 = localParticle.force[0];
-            outputForce[offset+1]               = localParticle.force[1];
-            outputForce[offset+2]               = localParticle.force[2];
-            outputTorque[offset]                = localParticle.torque[0];
-            outputTorque[offset+1]              = localParticle.torque[1];
-            outputTorque[offset+2]              = localParticle.torque[2];
-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            outputForce[offset]                 = sA[threadIdx.x].force[0];
-            outputForce[offset+1]               = sA[threadIdx.x].force[1];
-            outputForce[offset+2]               = sA[threadIdx.x].force[2];
-            outputTorque[offset]                = sA[threadIdx.x].torque[0];
+                } // end of j-loop
-            outputTorque[offset+1]              = sA[threadIdx.x].torque[1];
-            outputTorque[offset+2]              = sA[threadIdx.x].torque[2];
+                // Write results
+    #ifdef USE_OUTPUT_BUFFER_PER_WARP
+                float of;
+                unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                of                                  = outputForce[offset];
+                of                                 += localParticle.force[0];
+                outputForce[offset]                 = of;
+                of                                  = outputForce[offset+1];
+                of                                 += localParticle.force[1];
+                outputForce[offset+1]               = of;
+                of                                  = outputForce[offset+2];
+                of                                 += localParticle.force[2];
+                outputForce[offset+2]               = of;
+                of                                  = outputTorque[offset];
+                of                                 += localParticle.torque[0];
+                outputTorque[offset]                 = of;
+                of                                  = outputTorque[offset+1];
+                of                                 += localParticle.torque[1];
+                outputTorque[offset+1]               = of;
+                of                                  = outputTorque[offset+2];
+                of                                 += localParticle.torque[2];
+                outputTorque[offset+2]               = of;
+                offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                of                                  = outputForce[offset];
+                of                                 += sA[threadIdx.x].force[0];
+                outputForce[offset]                 = of;
+                of                                  = outputForce[offset+1];
+                of                                 += sA[threadIdx.x].force[1];
+                outputForce[offset+1]               = of;
+                of                                  = outputForce[offset+2];
+                of                                 += sA[threadIdx.x].force[2];
+                outputForce[offset+2]               = of;
+                of                                  = outputTorque[offset];
+                of                                 += sA[threadIdx.x].torque[0];
+                outputTorque[offset]                = of;
+                of                                  = outputTorque[offset+1];
+                of                                 += sA[threadIdx.x].torque[1];
+                outputTorque[offset+1]              = of;
+                of                                  = outputTorque[offset+2];
+                of                                 += sA[threadIdx.x].torque[2];
+                outputTorque[offset+2]              = of;
+    #else
+                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                outputForce[offset]                 = localParticle.force[0];
+                outputForce[offset+1]               = localParticle.force[1];
+                outputForce[offset+2]               = localParticle.force[2];
+                outputTorque[offset]                = localParticle.torque[0];
+                outputTorque[offset+1]              = localParticle.torque[1];
+                outputTorque[offset+2]              = localParticle.torque[2];
+                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                outputForce[offset]                 = sA[threadIdx.x].force[0];
+                outputForce[offset+1]               = sA[threadIdx.x].force[1];
+                outputForce[offset+2]               = sA[threadIdx.x].force[2];
+                outputTorque[offset]                = sA[threadIdx.x].torque[0];
+                outputTorque[offset+1]              = sA[threadIdx.x].torque[1];
+                outputTorque[offset+2]              = sA[threadIdx.x].torque[2];
+    #endif
+                lasty = y;
-#endif
+            } // end of pInteractionFlag block
-            lasty = y;
        }
        pos++;
    }
-//printf( "Hello thread: %d %d %d %d\n", blockIdx.x * blockDim.x + threadIdx.x, blockIdx.x, blockDim.x, threadIdx.x );
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
    }   
 }
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
    // Reduce field
    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    //const float term = 0.0f;
    while (pos < fieldComponents)
    {   
@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
 // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
 #undef GK
+#undef INCLUDE_FIXED_FIELD_BUFFERS
+#define INCLUDE_FIXED_FIELD_BUFFERS
 #include "kCalculateAmoebaCudaFixedFieldParticle.h"
+#undef INCLUDE_FIXED_FIELD_BUFFERS
+__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
+    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
+    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
+    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];
+    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
+    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
+    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
+}
 __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
                                                            float dscale, float pscale, float fields[4][3]
 #ifdef AMOEBA_DEBUG
@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-    float r2          = xr*xr + yr* yr + zr*zr;
+    float r2          = xr*xr + yr*yr + zr*zr;
    float r           = sqrtf(r2);
    // calculate the error function damping terms
@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    // increment the field at each site due to this interaction
-    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        fields[0][0]       = fim[0] - fid[0];
        fields[0][1]       = fim[1] - fid[1];
@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
        fields[2][2]       = 0.0f;
        fields[3][2]       = 0.0f;
    }
 #ifdef AMOEBA_DEBUG
    pullBack[0].x = xr;
    pullBack[0].y = yr;
@@ -399,7 +411,8 @@ static int isNanOrInfinity( double number ){
 static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 {
-    gpuContext gpu    = amoebaGpu->gpuContext;
+    static unsigned int threadsPerBlock  = 0;
+    gpuContext gpu                       = amoebaGpu->gpuContext;
 #ifdef AMOEBA_DEBUG
    static const char* methodName = "computeCudaAmoebaPmeFixedEField";
@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    // print intermediate results for the targetAtom 
-    unsigned int targetAtom  = 0;
+    unsigned int targetAtom  = 354;
-    int maxPrint             = 3002;
-    amoebaGpu->psE_Field->Download();
-    (void) fprintf( amoebaGpu->log, "Recip EFields In\n" );
-    for( int ii = 0; ii < gpu->natoms; ii++ ){
-        (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-        int indexOffset     = ii*3;
-        // E_Field
-        int isNan  = isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset] );
-            isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1] );
-            isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
-        (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] %s\n",
-                        amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
-                        amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
-                        amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2], (isNan ? "XXX" :"") );
-        if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-            ii = gpu->natoms - maxPrint;
-        }
-    }
-    (void) fflush( amoebaGpu->log );
-    (void) fprintf( amoebaGpu->log, "Recip EFields End\n" );
 #endif
    kClearFields_3( amoebaGpu, 2 );
+    // on first pass, set threads/block
+    if( threadsPerBlock == 0 ){ 
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 384; 
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128; 
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
+    }    
    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 #endif
    } else {
-        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                           //amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
+        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
-#if 0
-        for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-            //float index = 1.0f;
-            float index = (float) ii;
-            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
-                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk]   = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
-            }
-        }
-        amoebaGpu->psWorkArray_3_1->Upload();
-#endif
    kReducePmeDirectE_Fields( amoebaGpu );
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
+        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
+                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
+                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, 
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
        (void) fflush( amoebaGpu->log );
@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 */
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
+        (void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
+        int maxPrint             = 3002;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
        debugArray->Download();
        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        amoebaGpu->gpuContext->psPosq4->Download();
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
+if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
            (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
-            for( int kk = 0; kk < 10; kk++ ){
+            for( int kk = 0; kk < 6; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
                debugIndex += paddedNumberOfAtoms;
            }
+            (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
+                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
+                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
+                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
            (void) fprintf( amoebaGpu->log,"\n" );
+}
        }
@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector);
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
         }
         delete debugArray;
    }
 #endif
-        if( 0 ){
+        if( 1 ){
            std::vector<int> fileId;
            fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
@@ -44,10 +44,8 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
 ){
 #ifdef AMOEBA_DEBUG
-    int pullIndexMax = 12;
+    int maxPullIndex = 1;
    float4 pullBack[12];
-    float dScaleVal;
-    float pScaleVal;
 #endif
    extern __shared__ FixedFieldParticle sA[];
@@ -65,6 +63,10 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
        unsigned int x;
        unsigned int y;
        bool bExclusionFlag;
+        float dScaleValue;
+        float pScaleValue;
+        int  dScaleMask;
+        int2 pScaleMask;
        // extract cell coordinates
@@ -97,148 +99,77 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
            loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
-            if (!bExclusionFlag)
+            if( bExclusionFlag ){
-            {
-                // this branch is never exercised since it includes the
-                // interaction between atomI and itself which is always excluded
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijField[4][3];
-                    // load coords, charge, ...
-#ifdef AMOEBA_DEBUG
-dScaleVal = 1.0f;
-pScaleVal = 1.0f;
-#endif
-                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], 1.0f, 1.0f, ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
-                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
-match = 1;
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    fieldSum[0]            += match ? 0.0f : ijField[0][0];
-                    fieldSum[1]            += match ? 0.0f : ijField[0][1];
-                    fieldSum[2]            += match ? 0.0f : ijField[0][2];
-                    fieldPolarSum[0]       += match ? 0.0f : ijField[2][0];
-                    fieldPolarSum[1]       += match ? 0.0f : ijField[2][1];
-                    fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];
-                }
-            }
-            else  // bExclusion
-            {
                unsigned int xi       = x >> GRIDBITS;
                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+            } else {
+                dScaleValue = pScaleValue = 1.0f;
-                for (unsigned int j = 0; j < GRID; j++)
+            }
-                {
-                    // load coords, charge, ...
-                    float ijField[4][3];
+            for (unsigned int j = 0; j < GRID; j++)
-                    float dScaleValue;
+            {
-                    float pScaleValue;
+                if( bExclusionFlag ){
                    getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
                    getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
+                }
+                float ijField[4][3];
+                calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
 #ifdef AMOEBA_DEBUG
-dScaleVal = dScaleValue;
+                                            , pullBack
-pScaleVal = pScaleValue;
 #endif
-                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
+                );
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
+                // by setting match flag
-                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
+                unsigned int match      = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
-                    // by setting match flag
-                    unsigned int match      = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+                // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+                fieldSum[0]            += match ? 0.0f : ijField[0][0];
+                fieldSum[1]            += match ? 0.0f : ijField[0][1];
+                fieldSum[2]            += match ? 0.0f : ijField[0][2];
-                    fieldSum[0]            += match ? 0.0f : ijField[0][0];
+                fieldPolarSum[0]       += match ? 0.0f : ijField[2][0];
-                    fieldSum[1]            += match ? 0.0f : ijField[0][1];
+                fieldPolarSum[1]       += match ? 0.0f : ijField[2][1];
-                    fieldSum[2]            += match ? 0.0f : ijField[0][2];
+                fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];
-                    fieldPolarSum[0]       += match ? 0.0f : ijField[2][0];
-                    fieldPolarSum[1]       += match ? 0.0f : ijField[2][1];
-                    fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom ){
-        unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
+    unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
-        unsigned int indexI                = 0;
+    unsigned int indexI                = 0;
-        unsigned int indexJ                = indexI ? 0 : 2;
+    unsigned int indexJ                = indexI ? 0 : 2;
-        unsigned int indices[4]            = { indexI, indexJ, indexI+1, indexJ+1 };
+    unsigned int indices[4]            = { indexI, indexJ, indexI+1, indexJ+1 };
-        debugArray[index].x                = (float) atomI;
+    debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + j);
+    debugArray[index].y                = (float) (y + j);
-        debugArray[index].z                = dScaleValue;
+    debugArray[index].z                = dScaleValue;
-        debugArray[index].w                = pScaleValue;
+    debugArray[index].w                = pScaleValue;
+    float flag                         = 7.0f;
+    for( int ii = 0; ii < 4; ii++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        unsigned int off                   = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+        debugArray[index].x                = match ? 0.0f : ijField[indices[ii]][0];
-        debugArray[index].x                = (float) x;
+        debugArray[index].y                = match ? 0.0f : ijField[indices[ii]][1];
-        debugArray[index].y                = (float) tgx;
+        debugArray[index].z                = match ? 0.0f : ijField[indices[ii]][2];
-        debugArray[index].z                = -2;
+        debugArray[index].w                = flag;
-        debugArray[index].w                = (float) off;
+    }
+    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-        float flag                         = 7.0f;
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        for( int ii = 0; ii < 4; ii++ ){
+        debugArray[index].x                = pullBack[pullIndex].x;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+        debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].x                = match ? 0.0f : ijField[indices[ii]][0];
+        debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].y                = match ? 0.0f : ijField[indices[ii]][1];
+        debugArray[index].w                = pullBack[pullIndex].w;
-            debugArray[index].z                = match ? 0.0f : ijField[indices[ii]][2];
+    }   
-            debugArray[index].w                = flag;
-        }
-        for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullIndex].x;
-            debugArray[index].y                = pullBack[pullIndex].y;
-            debugArray[index].z                = pullBack[pullIndex].z;
-            debugArray[index].w                = pullBack[pullIndex].w;
-        }   
-/*
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = + 10.0f;
-*/
 }
 #endif
-                }
            }
            // Write results
@@ -253,170 +184,54 @@ if( atomI == targetAtom ){
            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
 #endif
-        }
+        } else {
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                // load coordinates, charge, ...
-                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
-            }
-            // zero shared fields
-            zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijField[4][3];
+            unsigned int flags = cSim.pInteractionFlag[pos];
+//  flags = 0xFFFFFFFF;
+            if (flags == 0) {
+                // No interactions in this block.
+            } else {
+                if (lasty != y ) {
-                    // load coords, charge, ...
+                    // load coordinates, charge, ...
-#ifdef AMOEBA_DEBUG
+                    loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
-dScaleVal = 1.0f;
-pScaleVal = 1.0f;
-#endif
-                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], 1.0f, 1.0f, ijField
-#ifdef AMOEBA_DEBUG
-                                                 , pullBack
-#endif
-                    );
-                    unsigned int outOfBounds     = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+                }
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    fieldSum[0]                 += outOfBounds ? 0.0f : ijField[0][0];
-                    fieldSum[1]                 += outOfBounds ? 0.0f : ijField[0][1];
-                    fieldSum[2]                 += outOfBounds ? 0.0f : ijField[0][2];
-                    fieldPolarSum[0]            += outOfBounds ? 0.0f : ijField[2][0];
-                    fieldPolarSum[1]            += outOfBounds ? 0.0f : ijField[2][1];
-                    fieldPolarSum[2]            += outOfBounds ? 0.0f : ijField[2][2];
-                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-                    psA[tj].eField[0]           += outOfBounds ? 0.0f : ijField[1][0];
-                    psA[tj].eField[1]           += outOfBounds ? 0.0f : ijField[1][1];
-                    psA[tj].eField[2]           += outOfBounds ? 0.0f : ijField[1][2];
-                    psA[tj].eFieldP[0]          += outOfBounds ? 0.0f : ijField[3][0];
-                    psA[tj].eFieldP[1]          += outOfBounds ? 0.0f : ijField[3][1];
-                    psA[tj].eFieldP[2]          += outOfBounds ? 0.0f : ijField[3][2];
-#ifdef AMOEBA_DEBUG
-if( (atomI == targetAtom  || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
-            unsigned int indexJ                = (atomI == targetAtom) ? 2 : 0;
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-            debugArray[index].z                = dScaleVal;
-            debugArray[index].w                = pScaleVal;
-            unsigned int pullBackIndex         = 0;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;;
-            pullBackIndex++;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;;
-            float flag                         = 8.0f;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-            debugArray[index].w                = flag;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-            debugArray[index].w                = flag;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI+1][0];
-            debugArray[index].y                = ijField[indexI+1][1];
-            debugArray[index].z                = ijField[indexI+1][2];
-            debugArray[index].w                = flag;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ+1][0];
-            debugArray[index].y                = ijField[indexJ+1][1];
-            debugArray[index].z                = ijField[indexJ+1][2];
-            debugArray[index].w                = flag;
-#if 0
+                // zero shared fields
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+                zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleValue + 10.0f;
-#endif
-}
-#endif
-                    tj                  = (tj + 1) & (GRID - 1);
+                if( bExclusionFlag ) {
+                    unsigned int xi   = x >> GRIDBITS;
+                    unsigned int yi   = y >> GRIDBITS;
+                    unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                } else {
+                    dScaleValue = pScaleValue  = 1.0f;
                }
-            }
-            else  // bExclusion
-            {
-                // Read fixed atom data into registers and GRF
-                unsigned int xi   = x >> GRIDBITS;
+                for (unsigned int j = 0; j < GRID; j++){
-                unsigned int yi   = y >> GRIDBITS;
-                unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
+                    unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
-                {
+                    if( bExclusionFlag ){
-                    // load coords, charge, ...
+                        getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
+                        getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
+                    }
                    float ijField[4][3];
+                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
-                    float dScaleValue;
-                    float pScaleValue;
-                    getMaskedDScaleFactor( tj, dScaleMask, &dScaleValue );
-                    getMaskedPScaleFactor( tj, pScaleMask, &pScaleValue );
-#ifdef AMOEBA_DEBUG
-dScaleVal = dScaleValue;
-pScaleVal = pScaleValue;
-#endif
-                    calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], dScaleValue, pScaleValue, ijField
 #ifdef AMOEBA_DEBUG
                                                , pullBack
 #endif
                    );
-                    unsigned int outOfBounds     = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+                    unsigned int outOfBounds     = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
                    fieldSum[0]                 += outOfBounds ? 0.0f : ijField[0][0];
                    fieldSum[1]                 += outOfBounds ? 0.0f : ijField[0][1];
                    fieldSum[2]                 += outOfBounds ? 0.0f : ijField[0][2];
@@ -424,104 +239,128 @@ pScaleVal = pScaleValue;
                    fieldPolarSum[0]            += outOfBounds ? 0.0f : ijField[2][0];
                    fieldPolarSum[1]            += outOfBounds ? 0.0f : ijField[2][1];
                    fieldPolarSum[2]            += outOfBounds ? 0.0f : ijField[2][2];
+                    if( flags == 0xFFFFFFFF ){
-                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
+                        // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-                    psA[tj].eField[0]           += outOfBounds ? 0.0f : ijField[1][0];
+                        psA[jIdx].eField[0]        += outOfBounds ? 0.0f : ijField[1][0];
-                    psA[tj].eField[1]           += outOfBounds ? 0.0f : ijField[1][1];
+                        psA[jIdx].eField[1]        += outOfBounds ? 0.0f : ijField[1][1];
-                    psA[tj].eField[2]           += outOfBounds ? 0.0f : ijField[1][2];
+                        psA[jIdx].eField[2]        += outOfBounds ? 0.0f : ijField[1][2];
-                    psA[tj].eFieldP[0]          += outOfBounds ? 0.0f : ijField[3][0];
+                        psA[jIdx].eFieldP[0]       += outOfBounds ? 0.0f : ijField[3][0];
-                    psA[tj].eFieldP[1]          += outOfBounds ? 0.0f : ijField[3][1];
+                        psA[jIdx].eFieldP[1]       += outOfBounds ? 0.0f : ijField[3][1];
-                    psA[tj].eFieldP[2]          += outOfBounds ? 0.0f : ijField[3][2];
+                        psA[jIdx].eFieldP[2]       += outOfBounds ? 0.0f : ijField[3][2];
+                    } else {
+                         psA[threadIdx.x].tempBuffer[0]  = outOfBounds ? 0.0f : ijField[1][0];
+                         psA[threadIdx.x].tempBuffer[1]  = outOfBounds ? 0.0f : ijField[1][1];
+                         psA[threadIdx.x].tempBuffer[2]  = outOfBounds ? 0.0f : ijField[1][2];
+                         psA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
+                         psA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
+                         psA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
+                        if( tgx % 2 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] ); 
+                        } 
+                        if( tgx % 4 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] ); 
+                        } 
+                        if( tgx % 8 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] ); 
+                        } 
+                        if( tgx % 16 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] ); 
+                        } 
+                        if (tgx == 0)
+                        {
+                            psA[jIdx].eField[0]  += psA[threadIdx.x].tempBuffer[0]  + psA[threadIdx.x+16].tempBuffer[0];
+                            psA[jIdx].eField[1]  += psA[threadIdx.x].tempBuffer[1]  + psA[threadIdx.x+16].tempBuffer[1];
+                            psA[jIdx].eField[2]  += psA[threadIdx.x].tempBuffer[2]  + psA[threadIdx.x+16].tempBuffer[2];
+                            psA[jIdx].eFieldP[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
+                            psA[jIdx].eFieldP[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
+                            psA[jIdx].eFieldP[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
+                        }
+                    }
 #ifdef AMOEBA_DEBUG
-if( (atomI == targetAtom || (y + tj) == targetAtom) ){
+if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
+            unsigned int index                 = (atomI == targetAtom) ? (y + jIdx) : atomI;
            unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
            unsigned int indexJ                = (atomI == targetAtom) ? 2 : 0;
            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
+            debugArray[index].y                = (float) (y + jIdx);
-            debugArray[index].z                = dScaleVal;
+            debugArray[index].z                = dScaleValue;
-            debugArray[index].w                = pScaleVal;
+            debugArray[index].w                = pScaleValue;
-            unsigned int pullBackIndex         = 0;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;;
-            pullBackIndex++;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;;
            float flag                         = 9.0f;
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexI][2];
-            debugArray[index].w                = flag;
+            debugArray[index].w                =  flag;
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexJ][2];
            debugArray[index].w                = flag;
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI+1][0];
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexI+1][0];
-            debugArray[index].y                = ijField[indexI+1][1];
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexI+1][1];
-            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexI+1][2];
            debugArray[index].w                = flag;
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ+1][0];
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexJ+1][0];
-            debugArray[index].y                = ijField[indexJ+1][1];
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexJ+1][1];
-            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexJ+1][2];
            debugArray[index].w                = flag;
+            for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-}
+                index                             += cAmoebaSim.paddedNumberOfAtoms;
+                debugArray[index].x                = pullBack[pullIndex].x;
+                debugArray[index].y                = pullBack[pullIndex].y;
+                debugArray[index].z                = pullBack[pullIndex].z;
+                debugArray[index].w                = pullBack[pullIndex].w;
+            }
+}        
 #endif
                    tj                  = (tj + 1) & (GRID - 1);
-                }
-            }
-            // Write results
+                } // j-loop block
+                // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
+                load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
-            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
+                load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
-            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
+                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
+                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
 #else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            load3dArray( offset, fieldSum,       outputEField );
+                load3dArray( offset, fieldSum,       outputEField );
-            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
+                load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
+                load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
-            load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
+                load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
 #endif
+            } // end of pInteractionFlag block 
            lasty = y;
-        }
+        } // x == y block
        pos++;
    }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
 //#define AMOEBA_DEBUG
 #undef AMOEBA_DEBUG
+#undef INCLUDE_MI_FIELD_BUFFERS
+#define INCLUDE_MI_FIELD_BUFFERS 
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
+#undef INCLUDE_MI_FIELD_BUFFERS
+__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
+    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
+    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
+    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];
+    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
+    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
+    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
+}
 // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
    // increment the field at each site due to this interaction
-    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        fields[0][0]       = fimd[0] - fid[0];
        fields[1][0]       = fkmd[0] - fkd[0];
@@ -370,7 +384,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
 {
-  gpuContext gpu    = amoebaGpu->gpuContext;
+  static unsigned int threadsPerBlock  = 0;
+  gpuContext gpu                       = amoebaGpu->gpuContext;
 #ifdef AMOEBA_DEBUG
    int targetAtom    = 0;
@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
    kClearFields_3( amoebaGpu, 2 );
+    // on first pass, set threads/block
+    if( threadsPerBlock == 0 ){  
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 384; 
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128; 
+        else
+            maxThreads = 64; 
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
+    }    
    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                 //gpu->sim.pInteractingWorkUnit,
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                 //amoebaGpu->psWorkUnit->_pDevStream[0],
+        kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
 #ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock,
+                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
 #endif
-        kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock,
+        kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
-                                                         sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                 gpu->sim.pInteractingWorkUnit,
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
                }
            }   
            (void) fflush( amoebaGpu->log );
+    if( 1 ){
+        std::vector<int> fileId;
+        fileId.push_back( iteration );
+        VectorOfDoubleVectors outputVector;
+        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
+     }
        }
 #endif
        iteration++;
@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
-    if( 0 ){
+    if( 1 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            debugArray[index].x                = (float) atomI;
            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = cAmoebaSim.cutoffDistance2;
+            debugArray[index].z                = cSim.nonbondedCutoffSqr;
            debugArray[index].w                = 6.0f;
@@ -209,72 +209,114 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
 #endif
-        }
+        } else {
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int atomJ        = y + tgx;
-                // load coordinates, charge, ...
-                loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
-            }
-           // zero shared fields
-            zeroMutualInducedParticleSharedField(  &(sA[threadIdx.x]) );
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                float ijField[4][3];
+            unsigned int flags = cSim.pInteractionFlag[pos];
+            if (flags == 0) {
-                // load coords, charge, ...
+                // No interactions in this block.
+            } else {
-                calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField
+                if (lasty != y)
+                {
+                    unsigned int atomJ        = y + tgx;
+                    // load coordinates, charge, ...
+                    loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
+                }
+               // zero shared fields
+                zeroMutualInducedParticleSharedField(  &(sA[threadIdx.x]) );
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
+                    float ijField[4][3];
+                    // load coords, charge, ...
+                    calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
 #ifdef AMOEBA_DEBUG
-, pullBack 
+    , pullBack 
 #endif
-   );
+       );
-                unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
-                // add to field at atomI the field due atomJ's dipole
+                    // add to field at atomI the field due atomJ's dipole
-                fieldSum[0]              += mask ? ijField[0][0] : 0.0f;
+                    fieldSum[0]              += mask ? ijField[0][0] : 0.0f;
-                fieldSum[1]              += mask ? ijField[0][1] : 0.0f;
+                    fieldSum[1]              += mask ? ijField[0][1] : 0.0f;
-                fieldSum[2]              += mask ? ijField[0][2] : 0.0f;
+                    fieldSum[2]              += mask ? ijField[0][2] : 0.0f;
+                    // add to polar field at atomI the field due atomJ's dipole
+                    fieldPolarSum[0]         += mask ? ijField[2][0] : 0.0f;
+                    fieldPolarSum[1]         += mask ? ijField[2][1] : 0.0f;
+                    fieldPolarSum[2]         += mask ? ijField[2][2] : 0.0f;
+                    // add to field at atomJ the field due atomI's dipole
+                    if( flags == 0xFFFFFFFF ){
+                        psA[jIdx].field[0]             += mask ? ijField[1][0] : 0.0f;
+                        psA[jIdx].field[1]             += mask ? ijField[1][1] : 0.0f;
+                        psA[jIdx].field[2]             += mask ? ijField[1][2] : 0.0f;
+                        // add to polar field at atomJ the field due atomI's dipole
+                        psA[jIdx].fieldPolar[0]        += mask ? ijField[3][0] : 0.0f;
+                        psA[jIdx].fieldPolar[1]        += mask ? ijField[3][1] : 0.0f;
+                        psA[jIdx].fieldPolar[2]        += mask ? ijField[3][2] : 0.0f;
+                    } else {
+                        psA[threadIdx.x].tempBuffer[0]  = mask ? 0.0f : ijField[1][0];
+                        psA[threadIdx.x].tempBuffer[1]  = mask ? 0.0f : ijField[1][1];
+                        psA[threadIdx.x].tempBuffer[2]  = mask ? 0.0f : ijField[1][2];
+                        psA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
+                        psA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
+                        psA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
+                        if( tgx % 2 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
+                        }
+                        if( tgx % 4 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
+                        }
+                        if( tgx % 8 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
+                        }
+                        if( tgx % 16 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
+                        }
+                        if (tgx == 0)
+                        {
+                            psA[jIdx].field[0]         += psA[threadIdx.x].tempBuffer[0]  + psA[threadIdx.x+16].tempBuffer[0];
+                            psA[jIdx].field[1]         += psA[threadIdx.x].tempBuffer[1]  + psA[threadIdx.x+16].tempBuffer[1];
+                            psA[jIdx].field[2]         += psA[threadIdx.x].tempBuffer[2]  + psA[threadIdx.x+16].tempBuffer[2];
+                            psA[jIdx].fieldPolar[0]    += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
+                            psA[jIdx].fieldPolar[1]    += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
+                            psA[jIdx].fieldPolar[2]    += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
+                        }
+                    }
-                // add to polar field at atomI the field due atomJ's dipole
-                fieldPolarSum[0]         += mask ? ijField[2][0] : 0.0f;
-                fieldPolarSum[1]         += mask ? ijField[2][1] : 0.0f;
-                fieldPolarSum[2]         += mask ? ijField[2][2] : 0.0f;
-                // add to field at atomJ the field due atomI's dipole
-                psA[tj].field[0]         += mask ? ijField[1][0] : 0.0f;
-                psA[tj].field[1]         += mask ? ijField[1][1] : 0.0f;
-                psA[tj].field[2]         += mask ? ijField[1][2] : 0.0f;
-                // add to polar field at atomJ the field due atomI's dipole
-                psA[tj].fieldPolar[0]    += mask ? ijField[3][0] : 0.0f;
-                psA[tj].fieldPolar[1]    += mask ? ijField[3][1] : 0.0f;
-                psA[tj].fieldPolar[2]    += mask ? ijField[3][2] : 0.0f;
 #ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+tj) == targetAtom ){
+if( atomI == targetAtom || (y+jIdx) == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y+tj) : atomI;
+            unsigned int index                 = atomI == targetAtom ? (y+jIdx) : atomI;
            unsigned int pullBackIndex         = 0;
            unsigned int indexI                = 0;
            unsigned int indexJ                = indexI ? 0 : 2;
            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
+            debugArray[index].y                = (float) (y + jIdx);
-            debugArray[index].z                = cAmoebaSim.cutoffDistance2;
+            debugArray[index].z                = cSim.nonbondedCutoffSqr;
            debugArray[index].w                = 7.0f;
@@ -315,57 +357,40 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
            debugArray[index].y                = ijField[indexJ+1][1];
            debugArray[index].z                = ijField[indexJ+1][2];
            debugArray[index].w                = flag;
-/*
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = + 10.0f;
-*/
 }
 #endif
-                tj                  = (tj + 1) & (GRID - 1);
+                    tj                  = (tj + 1) & (GRID - 1);
-            }
+                } // end of j-loop
-            // Write results
+                // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset     = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset     = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
+                load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
-            load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
+                load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
-            offset                  = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                offset                  = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field,      outputField );
+                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field,      outputField );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
+                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
 #else
-            unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            load3dArray( offset, fieldSum,      outputField );
+                load3dArray( offset, fieldSum,      outputField );
-            load3dArray( offset, fieldPolarSum, outputFieldPolar);
+                load3dArray( offset, fieldPolarSum, outputFieldPolar);
-            offset                  = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                offset                  = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            load3dArray( offset, sA[threadIdx.x].field,      outputField );
+                load3dArray( offset, sA[threadIdx.x].field,      outputField );
-            load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
+                load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
 #endif
-            lasty = y;
+                lasty = y;
-        }
+            } // end of pInteractionFlag block
+        } // end of x == y block
        pos++;
    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
        (void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log );
        amoebaGpu->psForce->Download();
        psCudaForce4->Download();
+        amoebaGpu->torqueMapForce->Download();
        amoebaGpu->psTorque->Download();
        int maxPrint        = 10;
        (void) fprintf( amoebaGpu->log,"Post torqueMap\n" );
@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
                            amoebaGpu->psForce->_pSysStream[0][indexOffset],
                            amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
                            amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+            (void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
+                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
+                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
+                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
            (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
                            amoebaGpu->psTorque->_pSysStream[0][indexOffset],
                            amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
            amoebaGpu->maxMapTorqueDifference,
            amoebaGpu->torqueMapForce->_pDevStream[0],
            psCudaForce4->_pDevStream[0] );
-    LAUNCHERROR("amoebaMapTorqueReduce_kernel2");
+    LAUNCHERROR("amoebaMapTorqueReduce_kernel3");
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
 }
+#undef USE_PERIODIC
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kFindInteractingBlocks.h"
+#undef USE_PERIODIC
+#undef METHOD_NAME
 void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
 {
    std::string methodName = "kCalculateAmoebaMultipoleForces";
@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
            cudaComputeAmoebaFixedEField( amoebaGpu );
            cudaComputeAmoebaMutualInducedField( amoebaGpu );
        } else {
+            gpuContext gpu = amoebaGpu->gpuContext;
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+if( 0 ){ 
+    gpu->psInteractionCount->Download();
+    gpu->psInteractingWorkUnit->Download();
+    gpu->psInteractionFlag->Download();
+    amoebaGpu->psWorkUnit->Download();
+    (void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
+    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
+        unsigned int x          = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
+        unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+        unsigned int exclusions = (x & 0x1);
+                     x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "Cell %8u  %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
+        x          = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+        y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+        exclusions = (x & 0x1);
+        x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "   %8u [%5u %5u %1u]   %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
+    }    
+} else {
+}
            cudaComputeAmoebaPmeFixedEField( amoebaGpu );
            cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
        }

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
    MapStringDouble tinkerEnergies;
    MapStringVectorOfVectors supplementary;
    MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE );
    bool gkIsActive;
    if( isPresent != forceMap.end() && isPresent->second != 0 ){