Direct space optimizations

8a331fb9 · Mark Friedrichs · af4d503a · 8a331fb9 · 8a331fb9 · 8a331fb9
Commit 8a331fb9 authored Apr 20, 2011 by Mark Friedrichs
17 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -3449,19 +3449,20 @@ tgx     = 0;

   Get threads/block

-   @param amoebaGpu        amoebaGpuContext
-   @param sharedMemoryPerThread shared memory/thread
+   @param amoebaGpu              amoebaGpuContext
+   @param sharedMemoryPerThread  shared memory/thread
+   @param sharedMemoryPerBlock   shared memory/block

   @return threadsPerBlock

   --------------------------------------------------------------------------------------- */

-unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread )
+unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
 {
    unsigned int grid               = amoebaGpu->gpuContext->grid;
-    unsigned int threadsPerBlock    = (amoebaGpu->gpuContext->sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
+    unsigned int threadsPerBlock    = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread); // integer division; NOTE(review): sharedMemoryPerThread == 0 would divide by zero -- confirm callers never pass 0
    threadsPerBlock                 = threadsPerBlock < 1 ? 1 : threadsPerBlock;
-    threadsPerBlock                 *= grid;
+    threadsPerBlock                *= grid; // count was clamped to >= 1 above, so the result is a whole multiple of grid

   return threadsPerBlock;
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -160,7 +160,7 @@ extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAS
 extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear );
 extern void kClearFields_1( amoebaGpuContext amoebaGpu );
 extern void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear );
-extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread );
+extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );

 //extern int isNanOrInfinity( double number );
 extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -759,7 +759,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle), gpu->sharedMemoryPerBlock), maxThreads);
    }

    kClearFields_3( amoebaGpu, 1 );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
@@ -362,7 +362,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

    kClearFields_3( amoebaGpu, 3 );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -108,7 +108,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
            maxThreads = 128; 
        else 
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1813,7 +1813,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle), gpu->sharedMemoryPerBlock ), maxThreads);

 #ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -978,7 +978,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
            maxThreads = 96;
        else
            maxThreads = 32;
-        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }   
    
 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -490,7 +490,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }
    
 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -276,7 +276,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
            maxThreads = 128; 
        else 
            maxThreads = 64; 
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }   

 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
@@ -4,7 +4,6 @@ struct MutualInducedParticle {
    float x;
    float y;
    float z;
-    float q;

    float inducedDipole[3];
    float inducedDipolePolar[3];
@@ -41,7 +40,6 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
    sA->x                        = posq.x;
    sA->y                        = posq.y;
    sA->z                        = posq.z;
-    sA->q                        = posq.w;

    // dipole


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -239,6 +239,9 @@ if( atomI == targetAtom || atomJ == targetAtom ){
                // No interactions in this block.
            } else {

+#ifdef CALCULATE_FULL_TILE
+                flags = 0xFFFFFFFF;
+#endif
                sA[threadIdx.x].force[0]     = 0.0f;
                sA[threadIdx.x].force[1]     = 0.0f;
                sA[threadIdx.x].force[2]     = 0.0f;
@@ -311,7 +314,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
                                psA[jIdx].torque[0]        += forceTorqueEnergy[2].x;
                                psA[jIdx].torque[1]        += forceTorqueEnergy[2].y;
                                psA[jIdx].torque[2]        += forceTorqueEnergy[2].z;
-    
+
+#ifndef CALCULATE_FULL_TILE
                            } else {
    
                                sA[threadIdx.x].tempForce[0]  = forceTorqueEnergy[0].x;
@@ -345,6 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
                                    psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
                                    psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
                                }
+#endif
                            }
                        } // end of atoms out-of-bounds
                    } // end of flags&(1<<j block

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -437,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
            maxThreads = 192;
        else
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }    

    if (gpu->bOutputBufferPerWarp){
@@ -469,7 +469,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
-                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
+                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3), gpu->sharedMemoryPerBlock),
                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -37,10 +37,9 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
 #undef AMOEBA_DEBUG

 #undef INCLUDE_MI_FIELD_BUFFERS
-#define INCLUDE_MI_FIELD_BUFFERS 
+//#define INCLUDE_MI_FIELD_BUFFERS 
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
-#undef INCLUDE_MI_FIELD_BUFFERS
-
+#ifdef INCLUDE_MI_FIELD_BUFFERS
 __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){

    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
@@ -51,6 +50,93 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
 }
+#endif
+
+// device helpers for the real space mutual induced field: per-pair setup (delta, prefactors) and field accumulation
+
+__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
+                                                       const float uscale, float4* delta, float* preFactor2 ) {
+    // compute the real space portion of the Ewald summation for the (atomI, atomJ) pair
+    // outputs: delta->x/y/z = atomJ - atomI displacement after periodic wrapping;
+    //          delta->w and *preFactor2 = scalar prefactors (both 0 when r2 exceeds the cutoff)
+    delta->x                = atomJ.x - atomI.x;
+    delta->y                = atomJ.y - atomI.y;
+    delta->z                = atomJ.z - atomI.z;
+
+    // periodic boundary conditions: subtract the nearest whole number of box lengths along each axis
+
+    delta->x               -= floor(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+    delta->y               -= floor(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+    delta->z               -= floor(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+
+    float r2                = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z); 
+    if( r2 <= cSim.nonbondedCutoffSqr ){
+        float r           = sqrtf(r2);
+
+        // calculate the error function damping terms
+
+        float ralpha      = cSim.alphaEwald*r;
+
+        float bn0         = erfc(ralpha)/r;
+        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
+        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
+        float exp2a       = exp(-(ralpha*ralpha));
+        alsq2n           *= alsq2;
+        float bn1         = (bn0+alsq2n*exp2a)/r2;
+
+        alsq2n           *= alsq2;
+        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
+
+        // compute the error function scaled and unscaled terms
+
+        float scale3      = 1.0f;
+        float scale5      = 1.0f;
+        float damp        = atomI.damp*atomJ.damp;
+        if( damp != 0.0f ){ // damping applies only when atomI.damp*atomJ.damp is non-zero
+
+            float ratio  = (r/damp);
+                  ratio  = ratio*ratio*ratio;
+            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
+                  damp   = -pgamma*ratio;
+
+            if( damp > -50.0f) { // otherwise scale3/scale5 keep their default of 1 (exp(damp) taken as negligible)
+                float expdamp = exp(damp);
+                scale3        = 1.0f - expdamp;
+                scale5        = 1.0f - expdamp*(1.0f-damp);
+            }
+        }
+        float dsc3        = uscale*scale3;
+        float dsc5        = uscale*scale5;
+
+        float r3          = (r*r2);
+        float r5          = (r3*r2);
+        float rr3         = (1.0f-dsc3)/r3;
+        float rr5         = 3.0f*(1.0f-dsc5)/r5;
+
+        delta->w          = rr3 - bn1; // coefficient that multiplies the inducing dipole itself
+        *preFactor2       = bn2 - rr5; // coefficient that multiplies (dipole . delta)*delta
+    } else {
+        delta->w = *preFactor2 = 0.0f; // beyond the cutoff: zero both prefactors
+    }
+}
+
+__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
+    // accumulate (+=) into fieldSum: preFactor2*(inducedDipole . delta.xyz)*delta.xyz + delta.w*inducedDipole
+    float preFactor3  = preFactor2*(inducedDipole[0]*delta.x   + inducedDipole[1]*delta.y  + inducedDipole[2]*delta.z); // NOTE(review): callers that only zero delta.w/preFactor2 still pass delta.x/y/z; 0*non-finite would give NaN -- confirm
+
+    fieldSum[0]      += preFactor3*delta.x + delta.w*inducedDipole[0];
+    fieldSum[1]      += preFactor3*delta.y + delta.w*inducedDipole[1];
+    fieldSum[2]      += preFactor3*delta.z + delta.w*inducedDipole[2];
+}
+
+__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
+    // overwrite (=) fieldSum with: preFactor2*(inducedDipole . delta.xyz)*delta.xyz + delta.w*inducedDipole
+    float preFactor3  = preFactor2*(inducedDipole[0]*delta.x   + inducedDipole[1]*delta.y  + inducedDipole[2]*delta.z); // NOTE(review): delta.x/y/z must be finite even when preFactor2 and delta.w are 0 -- confirm at call sites
+
+    fieldSum[0]       = preFactor3*delta.x + delta.w*inducedDipole[0];
+    fieldSum[1]       = preFactor3*delta.y + delta.w*inducedDipole[1];
+    fieldSum[2]       = preFactor3*delta.z + delta.w*inducedDipole[2];
+}

 // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field

@@ -385,7 +471,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
            maxThreads = 128; 
        else
            maxThreads = 64; 
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }    

 #ifdef AMOEBA_DEBUG
@@ -573,17 +659,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
           amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
        LAUNCHERROR("kSorUpdatePmeMutualInducedField");  

-            if( 0 ){
-                gpuContext gpu = amoebaGpu->gpuContext;
-                std::vector<int> fileId;
-                fileId.push_back( iteration );
-                VectorOfDoubleVectors outputVector;
-//                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
-//                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
-            }
+        if( 0 ){
+            gpuContext gpu = amoebaGpu->gpuContext;
+            std::vector<int> fileId;
+            fileId.push_back( iteration );
+            VectorOfDoubleVectors outputVector;
+//          cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
+//          cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+            cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
+        }

        // get total epsilon -- performing sums on gpu


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -100,99 +100,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
            for (unsigned int j = 0; j < GRID; j++)
            {

-                float4 ijField[3];
-
                // load coords, charge, ...

-                calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, ijField
-#ifdef AMOEBA_DEBUG
-, pullBack 
-#endif
-);
-
-                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
-
-                // add to field at atomI the field due atomJ's dipole
-
-                fieldSum[0]            += mask ? ijField[0].x : 0.0f;
-                fieldSum[1]            += mask ? ijField[1].x : 0.0f;
-                fieldSum[2]            += mask ? ijField[2].x : 0.0f;
-
-                fieldPolarSum[0]       += mask ? ijField[0].z : 0.0f;
-                fieldPolarSum[1]       += mask ? ijField[1].z : 0.0f;
-                fieldPolarSum[2]       += mask ? ijField[2].z : 0.0f;
-
-#ifdef AMOEBA_DEBUG
-/*
-if( atomI == targetAtom || (y+j) == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y+j) : atomI;
-            unsigned int pullBackIndex         = 0;
-            unsigned int indexI                = 0;
-            unsigned int indexJ                = indexI ? 0 : 2;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = cSim.nonbondedCutoffSqr;
-            debugArray[index].w                = 6.0f;
-
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;
-
-            pullBackIndex++;
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            float flag                         = 6.0f;
-            debugArray[index].x                = ijField[0].x;
-            debugArray[index].y                = ijField[1].x;
-            debugArray[index].z                = ijField[2].x;
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[0].x;
-            debugArray[index].y                = ijField[1].x;
-            debugArray[index].z                = ijField[2].x;
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[0].z;
-            debugArray[index].y                = ijField[1].z;
-            debugArray[index].z                = ijField[2].z;
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[0].z;
-            debugArray[index].y                = ijField[1].z;
-            debugArray[index].z                = ijField[2].z;
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : ijField[0].x;
-            debugArray[index].y                = match ? 0.0f : ijField[1].x;
-            debugArray[index].z                = match ? 0.0f : ijField[2].x;
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = + 10.0f;
-
-}
-*/
-#endif
+                float4 delta;
+                float prefactor2;
+                if(  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ){
+                    delta.w = prefactor2 = 0.0f;
+                } else {
+                    setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 );
+                }
+                calculateMutualInducedFieldPairIxn_kernel(  psA[j].inducedDipole,      delta, prefactor2, fieldSum );
+                calculateMutualInducedFieldPairIxn_kernel(  psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );

            }

@@ -226,6 +144,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                // No interactions in this block.
            } else {

+#ifndef INCLUDE_MI_FIELD_BUFFERS
+                flags = 0xFFFFFFFF;
+#endif
+
               // zero shared fields
    
                zeroMutualInducedParticleSharedField(  &(sA[threadIdx.x]) );
@@ -235,53 +157,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                    if ((flags&(1<<j)) != 0)
                    {
                        unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
-                        float4 ijField[3];
-
-                        // load coords, charge, ...
-
-                        calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
-    #ifdef AMOEBA_DEBUG
-        , pullBack
-    #endif
-           );
-
-                        unsigned int mask   =  ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;
-
-                        // add to field at atomI the field due atomJ's dipole
-
-                        fieldSum[0]              += mask ? ijField[0].x : 0.0f;
-                        fieldSum[1]              += mask ? ijField[1].x : 0.0f;
-                        fieldSum[2]              += mask ? ijField[2].x : 0.0f;
-
-                        // add to polar field at atomI the field due atomJ's dipole
-
-                        fieldPolarSum[0]         += mask ? ijField[0].z : 0.0f;
-                        fieldPolarSum[1]         += mask ? ijField[1].z : 0.0f;
-                        fieldPolarSum[2]         += mask ? ijField[2].z : 0.0f;
-
-                        // add to field at atomJ the field due atomI's dipole
-
+                        float4 delta;
+                        float prefactor2;
+                        if( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ){
+                            delta.w = prefactor2 = 0.0f;
+                        } else {
+                            setupMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, &delta, &prefactor2 );
+                        }
+                        calculateMutualInducedFieldPairIxn_kernel(  psA[jIdx].inducedDipole,          delta, prefactor2, fieldSum );
+                        calculateMutualInducedFieldPairIxn_kernel(  psA[jIdx].inducedDipolePolar,     delta, prefactor2, fieldPolarSum );
+#ifndef INCLUDE_MI_FIELD_BUFFERS
+                        calculateMutualInducedFieldPairIxn_kernel(  localParticle.inducedDipole,      delta, prefactor2, psA[jIdx].field );
+                        calculateMutualInducedFieldPairIxn_kernel(  localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
+#else
                        if( flags == 0xFFFFFFFF ){
-
-                            psA[jIdx].field[0]             += mask ? ijField[0].y : 0.0f;
-                            psA[jIdx].field[1]             += mask ? ijField[1].y : 0.0f;
-                            psA[jIdx].field[2]             += mask ? ijField[2].y : 0.0f;
-
-                            // add to polar field at atomJ the field due atomI's dipole
-
-                            psA[jIdx].fieldPolar[0]        += mask ? ijField[0].w : 0.0f;
-                            psA[jIdx].fieldPolar[1]        += mask ? ijField[1].w : 0.0f;
-                            psA[jIdx].fieldPolar[2]        += mask ? ijField[2].w : 0.0f;
-
+                            calculateMutualInducedFieldPairIxn_kernel(  localParticle.inducedDipole,      delta, prefactor2, psA[jIdx].field );
+                            calculateMutualInducedFieldPairIxn_kernel(  localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
                        } else {
-
-                            sA[threadIdx.x].tempBuffer[0]  = mask ? ijField[0].y : 0.0;
-                            sA[threadIdx.x].tempBuffer[1]  = mask ? ijField[1].y : 0.0;
-                            sA[threadIdx.x].tempBuffer[2]  = mask ? ijField[2].y : 0.0;
-
-                            sA[threadIdx.x].tempBufferP[0] = mask ? ijField[0].w : 0.0;
-                            sA[threadIdx.x].tempBufferP[1] = mask ? ijField[1].w : 0.0;
-                            sA[threadIdx.x].tempBufferP[2] = mask ? ijField[2].w : 0.0;
+                            calculateMutualInducedFieldPairIxnNoAdd_kernel(  localParticle.inducedDipole,      delta, prefactor2,  sA[threadIdx.x].tempBuffer );
+                            calculateMutualInducedFieldPairIxnNoAdd_kernel(  localParticle.inducedDipolePolar, delta, prefactor2,  sA[threadIdx.x].tempBufferP );

                            if( tgx % 2 == 0 ){
                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
@@ -308,61 +202,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                            }

                        }
-    
-/*
-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+jIdx) == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y+jIdx) : atomI;
-            unsigned int pullBackIndex         = 0;
-            unsigned int indexI                = 0;
-            unsigned int indexJ                = indexI ? 0 : 2;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + jIdx);
-            debugArray[index].z                = cSim.nonbondedCutoffSqr;
-            debugArray[index].w                = 7.0f;
-
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;
-
-            pullBackIndex++;
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            float flag                         = 7.0f;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI+1][0];
-            debugArray[index].y                = ijField[indexI+1][1];
-            debugArray[index].z                = ijField[indexI+1][2];
-            debugArray[index].w                = flag;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ+1][0];
-            debugArray[index].y                = ijField[indexJ+1][1];
-            debugArray[index].z                = ijField[indexJ+1][2];
-            debugArray[index].w                = flag;
-}
 #endif
-*/
+    
                    }
    
                    tj                  = (tj + 1) & (GRID - 1);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -531,7 +531,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
            maxThreads = 192; 
        else
            maxThreads = 128;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
    }    

    if( 0 ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
@@ -382,7 +382,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
            maxThreads = 192;
        else
            maxThreads = 64;
-       threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle)), maxThreads);
+       threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

 #ifdef AMOEBA_DEBUG