Mods for direct PME

a9054686 · Mark Friedrichs · 01260070 · a9054686 · a9054686 · a9054686
Commit a9054686 authored Oct 06, 2010 by Mark Friedrichs
18 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
    log                           = NULL;
    contextImpl                   = NULL;
    gpuInitialized                = false;
+    applyCutoff                   = 0;
+    multipoleForceCount           = 0;
 }   
 AmoebaCudaData::~AmoebaCudaData() {
@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
    return;
 }
+void AmoebaCudaData::incrementMultipoleForceCount( void ) {
+    multipoleForceCount++;
+}
+int AmoebaCudaData::getMultipoleForceCount( void ) const {
+    return multipoleForceCount;
+}
+void AmoebaCudaData::setApplyCutoff( int inputApplyCutoff ) {
+    applyCutoff = inputApplyCutoff;
+}
+int AmoebaCudaData::getApplyCutoff( void ) const {
+    return applyCutoff;
+}
 }
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -139,11 +139,41 @@ public:
     */
    void setContextImpl( void* contextImpl ); 
+    /**
+     * Get multipole force count 
+     * 
+     * @return multipole force count
+     */
+    int getMultipoleForceCount( void ) const; 
+    /**
+     * Increment multipole force count by one
+     * 
+     * (no return value)
+     */
+    void incrementMultipoleForceCount( void ); 
+    /**
+     * Get apply-cutoff flag 
+     * 
+     * @return current value of the apply-cutoff flag
+     */
+    int getApplyCutoff( ) const; 
+    /**
+     * Set apply-cutoff flag 
+     * 
+     * @param applyCutoff  new value of the apply-cutoff flag
+     */
+    void setApplyCutoff( int applyCutoff ); 
 private:
    CudaPlatform::PlatformData& cudaPlatformData;
    amoebaGpuContext amoebaGpu;
    bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
+    int multipoleForceCount;
+    int applyCutoff;
    KernelImpl* localForceKernel;
    unsigned int kernelCount;
    void* contextImpl;

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
 static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    amoebaGpuContext gpu = data.getAmoebaGpu();
+    if( data.getMultipoleForceCount() == 0 ){
+        gpuCopyInteractingWorkUnit( gpu );
+    }
+    if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
+        gpuReorderAtoms(gpu->gpuContext);
+    }
+    data.incrementMultipoleForceCount();
    data.initializeGpu();
    if( 0 && data.getLog() ){
@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            zsize = pmeGridDimension[2];
        }
        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
+        data.setApplyCutoff( 1 );
+        amoebaGpuContext amoebaGpu  = data.getAmoebaGpu();
+        gpuContext gpu              = amoebaGpu->gpuContext;
+        gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
+        gpu->sim.nonbondedMethod    = PARTICLE_MESH_EWALD;
    }
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     sqrtPi                             %15.7e\n",  amoebaGpu->amoebaSim.sqrtPi );
    (void) fprintf( log, "     alpha Ewald                        %15.7e\n",  gpu->sim.alphaEwald );
    (void) fprintf( log, "     PME grid dimensions                %6d %6d %6d\n",  gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z);
-    (void) fprintf( log, "     cutoffDistance2                    %15.7e\n",  amoebaGpu->amoebaSim.cutoffDistance2 );
+    (void) fprintf( log, "     nonbondedCutoffSqr                 %15.7e\n",  gpu->sim.nonbondedCutoffSqr);
    (void) fprintf( log, "     electric                           %15.7e\n",  amoebaGpu->amoebaSim.electric );
    (void) fprintf( log, "     box                                %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
    (void) fprintf( log, "     gkc                                %15.7e\n",  amoebaGpu->amoebaSim.gkc );
@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                        AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
        (void) fflush( amoebaGpu->log );
    }
-    amoebaGpu->amoebaSim.cutoffDistance2             = cutoffDistance*cutoffDistance;
    amoebaGpu->amoebaSim.sqrtPi                      = std::sqrt( 3.14159265358f );
    amoebaGpu->amoebaSim.electric                    = electricConstant;
    amoebaGpu->gpuContext->sim.alphaEwald            = alphaEwald;
@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
    }
 }
+/**---------------------------------------------------------------------------------------
+   Copy the Amoeba work units into the gpuContext work-unit streams
+
+   Each entry of amoebaGpu->psWorkUnit is written to the same index of both
+   gpu->psInteractingWorkUnit and gpu->psWorkUnit; the two gpuContext streams are then
+   uploaded. The log message marks this routine as temporary ("to be removed").
+
+   @param amoebaGpu            amoebaGpuContext reference
+   --------------------------------------------------------------------------------------- */
+void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
+// ---------------------------------------------------------------------------------------
+    gpuContext gpu = amoebaGpu->gpuContext;
+    // refresh the host-side copies before editing them (presumably device -> host; confirm against the stream class)
+    gpu->psInteractingWorkUnit->Download();
+    gpu->psWorkUnit->Download();
+    amoebaGpu->psWorkUnit->Download();
+    // the log is optional elsewhere in this code base; only write the trace message when it is set
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
+    }
+    // NOTE(review): the bound is psInteractingWorkUnit->_length; this assumes amoebaGpu->psWorkUnit
+    // and gpu->psWorkUnit hold at least that many entries -- TODO confirm
+    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
+        gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+        gpu->psWorkUnit->_pSysStream[0][ii]            = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+    }    
+    gpu->psInteractingWorkUnit->Upload();
+    gpu->psWorkUnit->Upload();
+// ---------------------------------------------------------------------------------------
+}
 #undef  AMOEBA_DEBUG
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
    unsigned int numberOfAtoms;                     // number of atoms
    unsigned int paddedNumberOfAtoms;               // padded number of atoms
-    float cutoffDistance2;                          // cutoff distance squared for PME
+    //float cutoffDistance2;                          // cutoff distance squared for PME; superseded by gpuContext sim.nonbondedCutoffSqr
    float sqrtPi;                                   // sqrt(PI)
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
 extern "C"
 void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
+extern "C"
+void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
 /*
 extern "C"
 void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
@@ -44,6 +44,11 @@ struct FixedFieldParticle {
    float gkField[3];
 #endif
+#ifdef INCLUDE_FIXED_FIELD_BUFFERS
+    float tempBuffer[3];
+    float tempBufferP[3];
+#endif
 };
 __device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
@@ -24,6 +24,11 @@ struct MutualInducedParticle {
    float fieldS[3];
    float fieldPolarS[3];
 #endif
+#ifdef INCLUDE_MI_FIELD_BUFFERS
+    float tempBuffer[3];
+    float tempBufferP[3];
+#endif
 };
 __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
        multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
        multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
        float* phi = &cAmoebaSim.pPhi[20*i];
-        cAmoebaSim.pTorque[3*i] = -cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
+        cAmoebaSim.pTorque[3*i] = cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
                      + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9]
                      + multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5]
                      - multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]);
-        cAmoebaSim.pTorque[3*i+1] = -cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
+        cAmoebaSim.pTorque[3*i+1] = cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
                      + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8]
                      + multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6]
                      - multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]);
-        cAmoebaSim.pTorque[3*i+2] = -cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
+        cAmoebaSim.pTorque[3*i+2] = cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
                      + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7]
                      + multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8]
                      - multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]);
@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
        f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
        f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
        float4 force = cSim.pForce4[i];
-        force.x += f.x;
+        force.x -= f.x;
-        force.y += f.y;
+        force.y -= f.y;
-        force.z += f.z;
+        force.z -= f.z;
        cSim.pForce4[i] = force;
@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
        multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
        multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
        float* phidp = &cAmoebaSim.pPhidp[20*i];
-        cAmoebaSim.pTorque[3*i] = -0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+        cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
                      + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
                      + multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
                      - multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
-        cAmoebaSim.pTorque[3*i+1] = -0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+        cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
                      + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
                      + multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
                      - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
-        cAmoebaSim.pTorque[3*i+2] = -0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+        cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
                      + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
                      + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
                      - multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
        f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
        f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
        float4 force = cSim.pForce4[i];
-        force.x += f.x;
+        force.x -= f.x;
-        force.y += f.y;
+        force.y -= f.y;
-        force.z += f.z;
+        force.z -= f.z;
        cSim.pForce4[i] = force;
    }
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
    float torque[3];
    float padding;
+    float tempForce[3];
+    float tempTorque[3];
 };
+__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){
+    atomI.tempForce[0]  += atomJ.tempForce[0];
+    atomI.tempForce[1]  += atomJ.tempForce[1];
+    atomI.tempForce[2]  += atomJ.tempForce[2];
+    atomI.tempTorque[0] += atomJ.tempTorque[0];
+    atomI.tempTorque[1] += atomJ.tempTorque[1];
+    atomI.tempTorque[2] += atomJ.tempTorque[2];
+}
 /*
 __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
    float uiy        = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
    float uiz        = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
-    atomI.torque[0] -= term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
+    atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
-    atomI.torque[1] -= term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
+    atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
-    atomI.torque[2] -= term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
+    atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
 }
 __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI,   PmeDirectElectrostaticParticle& atomJ,
@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
    float gfr[8],gfri[7];
    float gti[7],gtri[7];
-    float conversionFactor   = (cAmoebaSim.electric/cAmoebaSim.dielec);
+    float conversionFactor   = (-cAmoebaSim.electric/cAmoebaSim.dielec);
    // set the permanent multipole and induced dipole values;
@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
    zr         -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
    float r2    = xr*xr + yr*yr + zr*zr;
-    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        float r      = sqrt(r2);
        float ck     = atomJ.q;
@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
        e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
        ei = ei - erli;
-        *energy = conversionFactor*(e + ei);
+        *energy = -conversionFactor*(e + ei);
        // increment the total intramolecular energy; assumes;
        // intramolecular distances are less than half of cell;
@@ -1161,15 +1173,27 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
            maxThreads = 128;
        else
            maxThreads = 64;
-      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), maxThreads);
    }
    kClearFields_3( amoebaGpu, 2 );
+#ifdef AMOEBA_DEBUG
+    (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n", 
+                    threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
+                    (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
+      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
+                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                      sizeof(PmeDirectElectrostaticParticle)+sizeof(float3), (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits,
+                      gpu->sim.nonbond_threads_per_block );
+      (void) fflush( amoebaGpu->log );
+#endif
    if (gpu->bOutputBufferPerWarp){
-      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
-                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                         gpu->sim.pInteractingWorkUnit,
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
    } else {
-#ifdef AMOEBA_DEBUG
-      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                      sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-      (void) fflush( amoebaGpu->log );
-#endif
-      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
+//                                                                         gpu->sim.pInteractingWorkUnit,
-                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+//                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
+                                                                         gpu->sim.pInteractingWorkUnit,
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
        (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
-        int maxPrint        = 1400;
+        int maxPrint        = 5;
        float conversion    = 1.0f/41.84f;
        float forceSum[3]   = { 0.0f, 0.0f, 0.0f};
        for( int ii = 0; ii < gpu->natoms; ii++ ){
@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
        }
        (void) fprintf( amoebaGpu->log,"\n" );
-        if( 1 ){
+        if( 0 ){
            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
            for( int jj = 0; jj < gpu->natoms; jj++ ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
    }   
 }
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
    // Reduce field
    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    //const float term = 0.0f;
    while (pos < fieldComponents)
    {   
@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
 // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
 #undef GK
+#undef INCLUDE_FIXED_FIELD_BUFFERS
+#define INCLUDE_FIXED_FIELD_BUFFERS
 #include "kCalculateAmoebaCudaFixedFieldParticle.h"
+#undef INCLUDE_FIXED_FIELD_BUFFERS
+__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
+    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
+    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
+    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];
+    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
+    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
+    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
+}
 __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
                                                            float dscale, float pscale, float fields[4][3]
 #ifdef AMOEBA_DEBUG
@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-    float r2          = xr*xr + yr* yr + zr*zr;
+    float r2          = xr*xr + yr*yr + zr*zr;
    float r           = sqrtf(r2);
    // calculate the error function damping terms
@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    // increment the field at each site due to this interaction
-    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        fields[0][0]       = fim[0] - fid[0];
        fields[0][1]       = fim[1] - fid[1];
@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
        fields[2][2]       = 0.0f;
        fields[3][2]       = 0.0f;
    }
 #ifdef AMOEBA_DEBUG
    pullBack[0].x = xr;
    pullBack[0].y = yr;
@@ -399,6 +411,7 @@ static int isNanOrInfinity( double number ){
 static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 {
+    static unsigned int threadsPerBlock  = 0;
    gpuContext gpu                       = amoebaGpu->gpuContext;
 #ifdef AMOEBA_DEBUG
@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    // print intermediate results for the targetAtom 
-    unsigned int targetAtom  = 0;
+    unsigned int targetAtom  = 354;
-    int maxPrint             = 3002;
-    amoebaGpu->psE_Field->Download();
-    (void) fprintf( amoebaGpu->log, "Recip EFields In\n" );
-    for( int ii = 0; ii < gpu->natoms; ii++ ){
-        (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-        int indexOffset     = ii*3;
-        // E_Field
-        int isNan  = isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset] );
-            isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1] );
-            isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
-        (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] %s\n",
-                        amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
-                        amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
-                        amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2], (isNan ? "XXX" :"") );
-        if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-            ii = gpu->natoms - maxPrint;
-        }
-    }
-    (void) fflush( amoebaGpu->log );
-    (void) fprintf( amoebaGpu->log, "Recip EFields End\n" );
 #endif
    kClearFields_3( amoebaGpu, 2 );
+    // on first pass, set threads/block
+    if( threadsPerBlock == 0 ){ 
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 384; 
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128; 
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
+    }    
    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 #endif
    } else {
-        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                           //amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
+        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
-#if 0
-        for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-            //float index = 1.0f;
-            float index = (float) ii;
-            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
-                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk]   = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
-            }
-        }
-        amoebaGpu->psWorkArray_3_1->Upload();
-#endif
    kReducePmeDirectE_Fields( amoebaGpu );
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
+        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
+                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
+                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, 
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
        (void) fflush( amoebaGpu->log );
@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 */
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
+        (void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
+        int maxPrint             = 3002;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
        debugArray->Download();
        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        amoebaGpu->gpuContext->psPosq4->Download();
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
+if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
            (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
-            for( int kk = 0; kk < 10; kk++ ){
+            for( int kk = 0; kk < 6; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
                debugIndex += paddedNumberOfAtoms;
            }
+            (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
+                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
+                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
+                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
+                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
            (void) fprintf( amoebaGpu->log,"\n" );
+}
        }
@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector);
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
         }
         delete debugArray;
    }
 #endif
-        if( 0 ){
+        if( 1 ){
            std::vector<int> fileId;
            fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
 //#define AMOEBA_DEBUG
 #undef AMOEBA_DEBUG
+#undef INCLUDE_MI_FIELD_BUFFERS
+#define INCLUDE_MI_FIELD_BUFFERS 
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
+#undef INCLUDE_MI_FIELD_BUFFERS
+// Accumulate the scratch field buffers of atomJ into those of atomI:
+// the three tempBuffer components first, then the three tempBufferP
+// components. atomJ is only read; atomI is updated in place.
+__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
+    for( int component = 0; component < 3; component++ ){
+        atomI.tempBuffer[component] += atomJ.tempBuffer[component];
+    }
+    for( int component = 0; component < 3; component++ ){
+        atomI.tempBufferP[component] += atomJ.tempBufferP[component];
+    }
+}
 // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
    // increment the field at each site due to this interaction
-    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+    if( r2 <= cSim.nonbondedCutoffSqr ){
        fields[0][0]       = fimd[0] - fid[0];
        fields[1][0]       = fkmd[0] - fkd[0];
@@ -370,6 +384,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
 {
+  static unsigned int threadsPerBlock  = 0;
  gpuContext gpu                       = amoebaGpu->gpuContext;
 #ifdef AMOEBA_DEBUG
@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
    kClearFields_3( amoebaGpu, 2 );
+    // on first pass, set threads/block
+    if( threadsPerBlock == 0 ){  
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 384; 
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128; 
+        else
+            maxThreads = 64; 
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
+    }    
    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                 //gpu->sim.pInteractingWorkUnit,
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                 //amoebaGpu->psWorkUnit->_pDevStream[0],
+        kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
 #ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock,
+                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
 #endif
-        kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock,
+        kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
-                                                         sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+                                                                 gpu->sim.pInteractingWorkUnit,
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
                }
            }   
            (void) fflush( amoebaGpu->log );
+    if( 1 ){
+        std::vector<int> fileId;
+        fileId.push_back( iteration );
+        VectorOfDoubleVectors outputVector;
+        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
+     }
        }
 #endif
        iteration++;
@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
-    if( 0 ){
+    if( 1 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            debugArray[index].x                = (float) atomI;
            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = cAmoebaSim.cutoffDistance2;
+            debugArray[index].z                = cSim.nonbondedCutoffSqr;
            debugArray[index].w                = 6.0f;
@@ -209,10 +209,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
 #endif
-        }
+        } else {
-        else        // 100% utilization
-        {
+            unsigned int flags = cSim.pInteractionFlag[pos];
-            // Read fixed atom data into registers and GRF
+            if (flags == 0) {
+                // No interactions in this block.
+            } else {
                if (lasty != y)
                {
                    unsigned int atomJ        = y + tgx;
@@ -229,17 +232,18 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                for (unsigned int j = 0; j < GRID; j++)
                {
+                    unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
                    float ijField[4][3];
                    // load coords, charge, ...
-                calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField
+                    calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
 #ifdef AMOEBA_DEBUG
-, pullBack 
+    , pullBack 
 #endif
       );
-                unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
                    // add to field at atomI the field due atomJ's dipole
@@ -255,26 +259,64 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                    // add to field at atomJ the field due atomI's dipole
-                psA[tj].field[0]         += mask ? ijField[1][0] : 0.0f;
+                    if( flags == 0xFFFFFFFF ){
-                psA[tj].field[1]         += mask ? ijField[1][1] : 0.0f;
-                psA[tj].field[2]         += mask ? ijField[1][2] : 0.0f;
+                        psA[jIdx].field[0]             += mask ? ijField[1][0] : 0.0f;
+                        psA[jIdx].field[1]             += mask ? ijField[1][1] : 0.0f;
+                        psA[jIdx].field[2]             += mask ? ijField[1][2] : 0.0f;
                        // add to polar field at atomJ the field due atomI's dipole
-                psA[tj].fieldPolar[0]    += mask ? ijField[3][0] : 0.0f;
+                        psA[jIdx].fieldPolar[0]        += mask ? ijField[3][0] : 0.0f;
-                psA[tj].fieldPolar[1]    += mask ? ijField[3][1] : 0.0f;
+                        psA[jIdx].fieldPolar[1]        += mask ? ijField[3][1] : 0.0f;
-                psA[tj].fieldPolar[2]    += mask ? ijField[3][2] : 0.0f;
+                        psA[jIdx].fieldPolar[2]        += mask ? ijField[3][2] : 0.0f;
+                    } else {
+                        // Stage this thread's contribution to the field at atom (y+jIdx) before the
+                        // reduction below. mask is 1 only when both atomI and (y+jIdx) are real
+                        // (non-padding) atoms, so -- exactly as in the flags == 0xFFFFFFFF branch --
+                        // valid pairs contribute ijField and padded pairs contribute zero.
+                        psA[threadIdx.x].tempBuffer[0]  = mask ? ijField[1][0] : 0.0f;
+                        psA[threadIdx.x].tempBuffer[1]  = mask ? ijField[1][1] : 0.0f;
+                        psA[threadIdx.x].tempBuffer[2]  = mask ? ijField[1][2] : 0.0f;
+                        psA[threadIdx.x].tempBufferP[0] = mask ? ijField[3][0] : 0.0f;
+                        psA[threadIdx.x].tempBufferP[1] = mask ? ijField[3][1] : 0.0f;
+                        psA[threadIdx.x].tempBufferP[2] = mask ? ijField[3][2] : 0.0f;
+                        if( tgx % 2 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
+                        }
+                        if( tgx % 4 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
+                        }
+                        if( tgx % 8 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
+                        }
+                        if( tgx % 16 == 0 ){
+                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
+                        }
+                        if (tgx == 0)
+                        {
+                            psA[jIdx].field[0]         += psA[threadIdx.x].tempBuffer[0]  + psA[threadIdx.x+16].tempBuffer[0];
+                            psA[jIdx].field[1]         += psA[threadIdx.x].tempBuffer[1]  + psA[threadIdx.x+16].tempBuffer[1];
+                            psA[jIdx].field[2]         += psA[threadIdx.x].tempBuffer[2]  + psA[threadIdx.x+16].tempBuffer[2];
+                            psA[jIdx].fieldPolar[0]    += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
+                            psA[jIdx].fieldPolar[1]    += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
+                            psA[jIdx].fieldPolar[2]    += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
+                        }
+                    }
 #ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+tj) == targetAtom ){
+if( atomI == targetAtom || (y+jIdx) == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y+tj) : atomI;
+            unsigned int index                 = atomI == targetAtom ? (y+jIdx) : atomI;
            unsigned int pullBackIndex         = 0;
            unsigned int indexI                = 0;
            unsigned int indexJ                = indexI ? 0 : 2;
            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
+            debugArray[index].y                = (float) (y + jIdx);
-            debugArray[index].z                = cAmoebaSim.cutoffDistance2;
+            debugArray[index].z                = cSim.nonbondedCutoffSqr;
            debugArray[index].w                = 7.0f;
@@ -315,31 +357,12 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
            debugArray[index].y                = ijField[indexJ+1][1];
            debugArray[index].z                = ijField[indexJ+1][2];
            debugArray[index].w                = flag;
-/*
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = + 10.0f;
-*/
 }
 #endif
                    tj                  = (tj + 1) & (GRID - 1);
-            }
+                } // end of j-loop
                // Write results
@@ -364,8 +387,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
 #endif
                lasty = y;
-        }
+            } // end of pInteractionFlag block
+        } // end of x == y block
        pos++;
    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
        (void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log );
        amoebaGpu->psForce->Download();
        psCudaForce4->Download();
+        amoebaGpu->torqueMapForce->Download();
        amoebaGpu->psTorque->Download();
        int maxPrint        = 10;
        (void) fprintf( amoebaGpu->log,"Post torqueMap\n" );
@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
                            amoebaGpu->psForce->_pSysStream[0][indexOffset],
                            amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
                            amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+            (void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
+                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
+                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
+                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
            (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
                            amoebaGpu->psTorque->_pSysStream[0][indexOffset],
                            amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
            amoebaGpu->maxMapTorqueDifference,
            amoebaGpu->torqueMapForce->_pDevStream[0],
            psCudaForce4->_pDevStream[0] );
-    LAUNCHERROR("amoebaMapTorqueReduce_kernel2");
+    LAUNCHERROR("amoebaMapTorqueReduce_kernel3");
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
 }
+#undef USE_PERIODIC
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kFindInteractingBlocks.h"
+#undef USE_PERIODIC
+#undef METHOD_NAME
 void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
 {
    std::string methodName = "kCalculateAmoebaMultipoleForces";
@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
            cudaComputeAmoebaFixedEField( amoebaGpu );
            cudaComputeAmoebaMutualInducedField( amoebaGpu );
        } else {
+            gpuContext gpu = amoebaGpu->gpuContext;
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+if( 0 ){ 
+    gpu->psInteractionCount->Download();
+    gpu->psInteractingWorkUnit->Download();
+    gpu->psInteractionFlag->Download();
+    amoebaGpu->psWorkUnit->Download();
+    (void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
+    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
+        unsigned int x          = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
+        unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+        unsigned int exclusions = (x & 0x1);
+                     x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "Cell %8u  %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
+        x          = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+        y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+        exclusions = (x & 0x1);
+        x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "   %8u [%5u %5u %1u]   %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
+    }    
+} else {
+}
            cudaComputeAmoebaPmeFixedEField( amoebaGpu );
            cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
        }

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
    MapStringDouble tinkerEnergies;
    MapStringVectorOfVectors supplementary;
    MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE );
    bool gkIsActive;
    if( isPresent != forceMap.end() && isPresent->second != 0 ){