Removal of several arrays no longer needed

6bad9d44 · Mark Friedrichs · 1beac75d · 6bad9d44 · 6bad9d44 · 6bad9d44
Commit 6bad9d44 authored Apr 08, 2011 by Mark Friedrichs
7 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -30,20 +30,10 @@
 #include "kernels/gputypes.h"
 #include "amoebaCudaTypes.h"
-#define THREADS_PER_BLOCK 256
 #include <map>
 typedef std::map<int,float> MapIntFloat;
 typedef MapIntFloat::const_iterator MapIntFloatCI;
-/* 
- * Remove
- * pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
- * allocation of torqueMapForce psCovalentDegree psPolarizationDegree
- * 
-   THREADS_PER_BLOCK
- */
 struct _amoebaGpuContext {
    _gpuContext* gpuContext;
@@ -112,7 +102,6 @@ struct _amoebaGpuContext {
    // multipole parameters
    CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
-    CUDAStream<int>* psMultipoleAxisOffset;
    // buffer indices used for mapping torques onto forces 
@@ -133,10 +122,10 @@ struct _amoebaGpuContext {
    CUDAStream<float2>*  psDampingFactorAndThole;
-    // slated for removal -- no longer used
+    // used to setup scaling constants
-    CUDAStream<int>*    psCovalentDegree;
+    std::vector<int>    covalentDegree;
-    CUDAStream<int>*    psPolarizationDegree;
+    std::vector<int>    polarizationDegree;
    // fixed-E field

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel(
                   float* inducedDipolePolarS )
 {
-    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*cSim.atoms )return;
+    while( pos < 3*cSim.atoms )
+    {
-    fixedEField[threadId]          *= polarizability[threadId];
-    inducedDipole[threadId]         = fixedEField[threadId];
-    fixedEFieldPolar[threadId]     *= polarizability[threadId];
-    inducedDipolePolar[threadId]    = fixedEFieldPolar[threadId];
-    fixedGkField[threadId]         *= polarizability[threadId];
+        fixedEField[pos]          *= polarizability[pos];
-    inducedDipoleS[threadId]        = fixedEField[threadId]       + fixedGkField[threadId];
+        inducedDipole[pos]         = fixedEField[pos];
-    inducedDipolePolarS[threadId]   = fixedEFieldPolar[threadId]  + fixedGkField[threadId];
+        fixedEFieldPolar[pos]     *= polarizability[pos];
+        inducedDipolePolar[pos]    = fixedEFieldPolar[pos];
+        fixedGkField[pos]         *= polarizability[pos];
+        inducedDipoleS[pos]        = fixedEField[pos]       + fixedGkField[pos];
+        inducedDipolePolarS[pos]   = fixedEFieldPolar[pos]  + fixedGkField[pos];
+        pos                       += blockDim.x*gridDim.x;
+    }
 }
@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel(
 {
    float polarSOR = 0.70f;
-    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*cSim.atoms)return;
+    while( pos < 3*cSim.atoms )
+    {
-    float previousDipole                = inducedDipole[threadId];
-    float previousDipoleP               = inducedDipoleP[threadId];
-    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
-    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        float previousDipole           = inducedDipole[pos];
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+        float previousDipoleP          = inducedDipoleP[pos];
+        inducedDipole[pos]             = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
+        inducedDipoleP[pos]            = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
+        inducedDipole[pos]             = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
+        inducedDipoleP[pos]            = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
+        matrixProduct[pos]             = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
+        matrixProductP[pos]            = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
+        pos                           += blockDim.x*gridDim.x;
+    }
 }
 __global__ 
@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
 {
    float polarSOR = 0.70f;
-    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*cSim.atoms)return;
+    while( pos < 3*cSim.atoms )
+    {
-    float previousDipole                = inducedDipole[threadId];
+        float previousDipole      = inducedDipole[pos];
-    float previousDipoleP               = inducedDipoleP[threadId];
+        float previousDipoleP     = inducedDipoleP[pos];
-    inducedDipole[threadId]             = fixedGkField[threadId]    + fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
+        inducedDipole[pos]        = fixedGkField[pos]    + fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
-    inducedDipoleP[threadId]            = fixedGkField[threadId]    + fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
+        inducedDipoleP[pos]       = fixedGkField[pos]    + fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        inducedDipole[pos]        = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        inducedDipoleP[pos]       = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        matrixProduct[pos]        = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+        matrixProductP[pos]       = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
+        pos                      += blockDim.x*gridDim.x;
+    }
 }
 // reduce psWorkArray_3_1 -> outputArray
@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
    LAUNCHERROR("kReduceMutualInducedAndGkFields4");
 }
-#ifdef AMOEBA_DEBUG
-#if 0
-static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
-{
-    (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
-    unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
-    unsigned int stop  = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
-    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        unsigned int ii3Index      = ii/3;
-        unsigned int bufferIndex   = ii3Index/(gpu->sim.paddedNumberOfAtoms);
-        unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
-    } 
-}
-static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
-{
-    (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
-    for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
-        unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
-    } 
-}
-#endif
-#endif
 /**---------------------------------------------------------------------------------------
   Compute mutual induce field
@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
        amoebaGpu->psWorkArray_3_3->Download();
        amoebaGpu->psWorkArray_3_4->Download();
-        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
-        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
-        //printMiFieldAtomBuffers( amoebaGpu, 100 );
-        //printMiFieldBuffer( amoebaGpu, 0 );
-        //printMiFieldBuffer( amoebaGpu, 1 );
-        //printMiFieldBuffer( amoebaGpu, 37 );
-        //printMiFieldBuffer( amoebaGpu, 38 );
    if( amoebaGpu->log && iteration == 1 ){
        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
    int iteration;
    gpuContext gpu     = amoebaGpu->gpuContext;
-    int numOfElems     = gpu->natoms*3;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log && timestep == 1 ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
-                        "maxIterations=%d targetEpsilon=%.3e\n", 
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
-                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
   // ---------------------------------------------------------------------------------------
    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-    kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
+    kInitializeMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
         amoebaGpu->psGk_Field->_pDevData,
@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
        // post matrix multiply
-        kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData,     amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,           amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psWorkVector[0]->_pDevData,     amoebaGpu->psWorkVector[1]->_pDevData );
        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");  
-        kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedAndGkFieldS_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipoleS->_pDevData,    amoebaGpu->psInducedDipolePolarS->_pDevData,
           amoebaGpu->psE_Field->_pDevData,          amoebaGpu->psE_FieldPolar->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel(
                   float* inducedDipolePolar )
 {
-    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*numberOfAtoms )return;
+    while( pos < 3*cSim.atoms )
+    {
-    fixedEField[threadId]         *= polarizability[threadId];
+        fixedEField[pos]         *= polarizability[pos];
-    inducedDipole[threadId]        = fixedEField[threadId];
+        inducedDipole[pos]        = fixedEField[pos];
-    fixedEFieldPolar[threadId]    *= polarizability[threadId];
+        fixedEFieldPolar[pos]    *= polarizability[pos];
-    inducedDipolePolar[threadId]   = fixedEFieldPolar[threadId];
+        inducedDipolePolar[pos]   = fixedEFieldPolar[pos];
+        pos                      += blockDim.x*gridDim.x;
+    }
 }
@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel(
 {
    float polarSOR = 0.70f;
-    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*numberOfEntries )return;
+    while( pos < 3*cSim.atoms )
+    {
-    float previousDipole                = inducedDipole[threadId];
-    float previousDipoleP               = inducedDipoleP[threadId];
-    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
-    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        float previousDipole       = inducedDipole[pos];
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        float previousDipoleP      = inducedDipoleP[pos];
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        inducedDipole[pos]         = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+        inducedDipoleP[pos]        = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
+        inducedDipole[pos]         = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
+        inducedDipoleP[pos]        = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
+        matrixProduct[pos]         = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
+        matrixProductP[pos]        = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
+        pos                       += blockDim.x*gridDim.x;
+    }
 }
@@ -469,29 +477,14 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
    int done;
    int iteration;
-     gpuContext gpu    = amoebaGpu->gpuContext;
+    gpuContext gpu    = amoebaGpu->gpuContext;
-    int numOfElems     = gpu->natoms*3;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
-                        "maxIterations=%d targetEpsilon=%.3e\n", 
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
-                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
   // ---------------------------------------------------------------------------------------
    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-    kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+    kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
         gpu->natoms,
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        // post matrix multiply
-        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel(
                   float* inducedDipolePolar )
 {
-    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*numberOfAtoms )return;
+    while( pos < 3*cSim.atoms )
+    {   
-    fixedEField[threadId]         *= polarizability[threadId];
+        fixedEField[pos]         *= polarizability[pos];
-    inducedDipole[threadId]        = fixedEField[threadId];
+        inducedDipole[pos]        = fixedEField[pos];
-    fixedEFieldPolar[threadId]    *= polarizability[threadId];
+        fixedEFieldPolar[pos]    *= polarizability[pos];
-    inducedDipolePolar[threadId]   = fixedEFieldPolar[threadId];
+        inducedDipolePolar[pos]   = fixedEFieldPolar[pos];
+        pos                      += blockDim.x*gridDim.x;
+    }
 }
@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel(
                   float* matrixProduct, float* matrixProductP )
 {
-    int threadId                        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*numberOfEntries )return;
+    while( pos < 3*cSim.atoms )
+    {   
-    float previousDipole                = inducedDipole[threadId];
-    float previousDipoleP               = inducedDipoleP[threadId];
-    // add self terms to fields
-    const float term                    = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    matrixProduct[threadId]            +=  term*previousDipole;
-    matrixProductP[threadId]           +=  term*previousDipoleP;
-    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
-    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
-    const float polarSOR                = 0.70f;
+        float previousDipole           = inducedDipole[pos];
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        float previousDipoleP          = inducedDipoleP[pos];
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        // add self terms to fields
+        const float term               = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
+        matrixProduct[pos]            +=  term*previousDipole;
+        matrixProductP[pos]           +=  term*previousDipoleP;
+        inducedDipole[pos]             = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
+        inducedDipoleP[pos]            = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
+        const float polarSOR           = 0.70f;
+        inducedDipole[pos]             = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
+        inducedDipoleP[pos]            = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
+        matrixProduct[pos]             = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
+        matrixProductP[pos]            = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        pos                           += blockDim.x*gridDim.x;
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+    }
 }
@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
    int iteration;
     gpuContext gpu    = amoebaGpu->gpuContext;
-    int numOfElems     = gpu->natoms*3;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
-                        "maxIterations=%d targetEpsilon=%.3e\n", 
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
-                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
   // ---------------------------------------------------------------------------------------
    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-    kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+    kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
         gpu->natoms,
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        // post matrix multiply
-        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
-//-----------------------------------------------------------------------------------------
-//-----------------------------------------------------------------------------------------
-#include "amoebaGpuTypes.h"
-#include "amoebaCudaKernels.h"
-#include <stdio.h>
-#undef AMOEBA_OFFSET_3
-#undef AMOEBA_INCLUDE_DIAGONAL 
-#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset1##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#define AMOEBA_OFFSET_3
-#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset3##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#undef AMOEBA_OFFSET_3
-#define AMOEBA_INCLUDE_DIAGONAL 
-#define METHOD_NAME(a, b) a##IncludeDiagonalOffset1##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#define AMOEBA_OFFSET_3
-#define METHOD_NAME(a, b) a##IncludeDiagonalOffset3##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#undef AMOEBA_OFFSET_3
-#undef AMOEBA_INCLUDE_DIAGONAL 
-void cudaReduceN2ToN( float *N2Array, int Nsz, float *NArray, int includeDiagonal, int offset )
-{
-    int numThreads        = min(THREADS_PER_BLOCK, (Nsz));
-    int numBlocksPerAtom  = (Nsz / numThreads);
-    if( Nsz % numThreads ){
-        numBlocksPerAtom++;
-    }
-    int numBlocks = numBlocksPerAtom*Nsz;
-    float *partialSum1_d;
-    // allocate GPU memory 
-    cudaMalloc( (void**) &partialSum1_d, numBlocks*offset*sizeof(float) );  
-    if( includeDiagonal ){
-       if( offset == 3 ){
-           kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel1");
-       } else if( offset == 1 ){
-           kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel2");
-       }
-    } else {
-       if( offset == 3 ){
-           kCalculateAmoebaReduceExcludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel3");
-       } else if( offset == 1 ){
-           kCalculateAmoebaReduceExcludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel4");
-       }
-    }
-    int numBlocks2 = numBlocks;
-    numBlocks      = numBlocks2*Nsz/numThreads;
-    if( (numBlocks2*Nsz) % numThreads ){
-        numBlocks++;
-    }
-    if( offset == 3 ){
-        kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
-        LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal3");
-    } else if( offset == 1 ){
-        kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
-        LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal1");
-    }
-    //Free memory
-    cudaFree(partialSum1_d);
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
-//-----------------------------------------------------------------------------------------
-//-----------------------------------------------------------------------------------------
-typedef unsigned int uint;
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaReduce, N2ToNBlockLevel)( float *N2Array, float *partialSum, int num,int numberOfBlocksPerAtom )
-{
-   uint tid = threadIdx.x;
-   __shared__ float asx[THREADS_PER_BLOCK];
-   asx[tid] = 0.0f;
-#ifdef AMOEBA_OFFSET_3
-    __shared__ float asy[THREADS_PER_BLOCK];
-    __shared__ float asz[THREADS_PER_BLOCK];
-    asx[tid] = 0.0f;
-    asy[tid] = asz[tid] = 0.0f;
-    int offset = 3;
-#else
-    int offset = 1;
-#endif
-    int atomI =  blockIdx.x / numberOfBlocksPerAtom;
-    int atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x+tid;
-#ifdef AMOEBA_INCLUDE_DIAGONAL
-    if( atomJ < num && atomI < num ){
-#else
-    if( atomJ < num && atomJ != atomI ){
-#endif
-      int index = offset*(atomI*num + atomJ);
-      asx[tid] = N2Array[index];
-#ifdef AMOEBA_OFFSET_3
-      asy[tid] = N2Array[index+1];
-      asz[tid] = N2Array[index+2];
-#endif
-    }
-    __syncthreads(); //to make sure all the elements are loaded
-    for( uint s = (blockDim.x)/2; s != 0; s >>= 1 ){
-      if( tid < s ){
-        asx[tid] += asx[tid+s];
-#ifdef AMOEBA_OFFSET_3
-        asy[tid] += asy[tid+s];
-        asz[tid] += asz[tid+s];
-#endif
-      }
-      __syncthreads();
-    }
-    if( tid == 0 ){
-      partialSum[blockIdx.x*offset] = asx[0];
-#ifdef AMOEBA_OFFSET_3
-      partialSum[blockIdx.x*3+1]    = asy[0];
-      partialSum[blockIdx.x*3+2]    = asz[0];
-#endif
-    }  
-}
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaReduce, N2ToNFinal)( float *partialSum, float *final,int num,int numberOfBlocksPerAtom )
-{
-    uint thread_id = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( thread_id > num )return;
-    float3 sum;
-#ifdef AMOEBA_OFFSET_3
-    int offset = 3;
-    sum.x = sum.y = sum.z = 0.0f;
-#else
-    int offset = 1;
-    sum.x      = 0.0f;
-#endif
-    int index = thread_id*offset*numberOfBlocksPerAtom;
-    for( int i=0; i < numberOfBlocksPerAtom; i++ ){
-      sum.x += partialSum[index + i*offset];
-#ifdef AMOEBA_OFFSET_3
-      sum.y += partialSum[index + i*offset+1];
-      sum.z += partialSum[index + i*offset+2];
-#endif
-    }
-    final[thread_id*offset   ] = sum.x;
-#ifdef AMOEBA_OFFSET_3
-    final[thread_id*3+1 ]      = sum.y;
-    final[thread_id*3+2 ]      = sum.z;
-#endif
-}