Removal of several arrays no longer needed

6bad9d44 · Mark Friedrichs · 1beac75d · 6bad9d44 · 6bad9d44 · 6bad9d44
Commit 6bad9d44 authored Apr 08, 2011 by Mark Friedrichs
7 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -97,87 +97,94 @@ void gpuPrintCudaStream( std::string name,
 }
 extern "C"
-void gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*sizeof( float );
 }
 extern "C"
-void gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( float );
 }
 extern "C"
-void gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( float );
 }
 extern "C"
-void gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int );
 }
 extern "C"
-void gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*sizeof( int );
 }
 extern "C"
-void gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( int );
 }
 extern "C"
-void gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log )
+int gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log )
 {
-    if( cUDAStream == NULL )return;
+    if( cUDAStream == NULL )return 0;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
+    return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( int );
 }
 extern "C"
@@ -186,6 +193,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    if( log == NULL )return;
    _gpuContext* gpu                            = amoebaGpu->gpuContext;
+    int totalMemory                             = 0;
    (void) fprintf( log, "cudaAmoebaGmxSimulation:\n\n" );
    (void) fprintf( log, "\n" );
@@ -206,32 +215,32 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     outputBuffers                      %u\n",      gpu->sim.outputBuffers );
    (void) fprintf( log, "     workUnits                          %u\n",      amoebaGpu->workUnits );
-    gpuPrintCudaStreamFloat(  amoebaGpu->gpuContext->psEnergy,    log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->gpuContext->psEnergy,    log );
-    gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psForce4,    log );
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psForce4,    log );
-    gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psPosq4,     log );
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psPosq4,     log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->gpuContext->psObcData,   log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->gpuContext->psObcData,   log );
-    gpuPrintCudaStreamFloat(  amoebaGpu->gpuContext->psBornForce, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->gpuContext->psBornForce, log );
    (void) fprintf( log, "\n\n" );
    (void) fprintf( log, "     amoebaBonds                       %u\n",      amoebaGpu->amoebaSim.amoebaBonds );
-    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_1, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_1, log );
-    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_2, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_2, log );
-    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_3, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_3, log );
-    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_4, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_4, log );
-    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_1_1, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_1_1, log );
-    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_1_2, log );
+    totalMemory += gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_1_2, log );
    (void) fprintf( log, "\n\n" );
-    gpuPrintCudaStreamUnsignedInt( amoebaGpu->psWorkUnit, log );
+    totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psWorkUnit, log );
-    gpuPrintCudaStreamInt(  amoebaGpu->psScalingIndicesIndex, log );
+    totalMemory += gpuPrintCudaStreamInt(  amoebaGpu->psScalingIndicesIndex, log );
-    gpuPrintCudaStreamInt(  amoebaGpu->ps_D_ScaleIndices, log );
+    totalMemory += gpuPrintCudaStreamInt(  amoebaGpu->ps_D_ScaleIndices, log );
-    gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log );
+    totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log );
-    gpuPrintCudaStreamInt2( amoebaGpu->ps_M_ScaleIndices, log );
+    totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_M_ScaleIndices, log );
    if( amoebaGpu->psAmoebaBondParameter)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaBondID, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaBondID, log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaBondParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaBondParameter, log );
    (void) fprintf( log, "     amoebaBonds                       %u\n",      amoebaGpu->amoebaSim.amoebaBonds );
    (void) fprintf( log, "     amoebaBond_offset                 %u\n",      amoebaGpu->amoebaSim.amoebaBond_offset );
    (void) fprintf( log, "     cubic                             %15.7e\n",  amoebaGpu->amoebaSim.amoebaBondCubicParameter);
@@ -239,9 +248,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaBondID                     %p\n",      amoebaGpu->amoebaSim.pAmoebaBondID );
    (void) fprintf( log, "     pAmoebaBondParameter              %p\n",      amoebaGpu->amoebaSim.pAmoebaBondParameter );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaAngleID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaAngleID1, log );
-    gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaAngleID2, log );
+    totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaAngleID2, log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaAngleParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaAngleParameter, log );
    (void) fprintf( log, "\n" );
    (void) fprintf( log, "     amoebaAngles                      %u\n",      amoebaGpu->amoebaSim.amoebaAngles );
    (void) fprintf( log, "     amoebaAngle_offset                %u\n",      amoebaGpu->amoebaSim.amoebaAngle_offset );
@@ -254,9 +263,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaAngleParameter             %p\n",      amoebaGpu->amoebaSim.pAmoebaAngleParameter );
    if( amoebaGpu->psAmoebaInPlaneAngleID1 )(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID1, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID2, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID2, log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaInPlaneAngleParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaInPlaneAngleParameter, log );
    (void) fprintf( log, "\n" );
    (void) fprintf( log, "     amoebaInPlaneAngles               %u\n",      amoebaGpu->amoebaSim.amoebaInPlaneAngles );
    (void) fprintf( log, "     amoebaInPlaneAngle_offset         %u\n",      amoebaGpu->amoebaSim.amoebaInPlaneAngle_offset );
@@ -270,10 +279,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    if( amoebaGpu->psAmoebaTorsionID1)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID1, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID2, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID2, log );
-    gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionParameter1, log );
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionParameter1, log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaTorsionParameter2, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaTorsionParameter2, log );
    (void) fprintf( log, "     amoebaTorsions                    %u\n",      amoebaGpu->amoebaSim.amoebaTorsions );
    (void) fprintf( log, "     amoebaTorsion_offset              %u\n",      amoebaGpu->amoebaSim.amoebaTorsion_offset );
    (void) fprintf( log, "     pAmoebaTorsionID1                 %p\n",      amoebaGpu->amoebaSim.pAmoebaTorsionID1 );
@@ -282,10 +291,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaTorsionParameter2          %p\n",      amoebaGpu->amoebaSim.pAmoebaTorsionParameter2 );
    if( amoebaGpu->psAmoebaPiTorsionID1)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID1, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID2, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID2, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID3, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID3, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaPiTorsionParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaPiTorsionParameter, log );
    (void) fprintf( log, "     amoebaPiTorsions                  %u\n",      amoebaGpu->amoebaSim.amoebaPiTorsions );
    (void) fprintf( log, "     amoebaPiTorsion_offset            %u\n",      amoebaGpu->amoebaSim.amoebaPiTorsion_offset );
@@ -295,9 +304,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaPiTorsionParameter         %p\n",      amoebaGpu->amoebaSim.pAmoebaPiTorsionParameter );
    if( amoebaGpu->psAmoebaStretchBendID1)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaStretchBendID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaStretchBendID1, log );
-    gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaStretchBendID2, log );
+    totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaStretchBendID2, log );
-    gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaStretchBendParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaStretchBendParameter, log );
    (void) fprintf( log, "     amoebaStretchBend                  %u\n",      amoebaGpu->amoebaSim.amoebaStretchBends );
    (void) fprintf( log, "     amoebaStretchBend_offset           %u\n",      amoebaGpu->amoebaSim.amoebaStretchBend_offset );
    (void) fprintf( log, "     pAmoebaStretchBendID1              %p\n",      amoebaGpu->amoebaSim.pAmoebaStretchBendID1 );
@@ -305,9 +314,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaStretchBendParameter        %p\n",      amoebaGpu->amoebaSim.pAmoebaStretchBendParameter );
    if( amoebaGpu->psAmoebaOutOfPlaneBendID1)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID1, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID2, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID2, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaOutOfPlaneBendParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaOutOfPlaneBendParameter, log );
    (void) fprintf( log, "     amoebaOutOfPlaneBend               %u\n",      amoebaGpu->amoebaSim.amoebaOutOfPlaneBends );
    (void) fprintf( log, "     amoebaOutOfPlaneBend_offset        %u\n",      amoebaGpu->amoebaSim.amoebaOutOfPlaneBend_offset );
    (void) fprintf( log, "     amoebaOutOfPlaneBendCubicK         %15.7e\n",  amoebaGpu->amoebaSim.amoebaOutOfPlaneBendCubicK );
@@ -319,10 +328,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaOutOfPlaneBendParameter     %p\n",      amoebaGpu->amoebaSim.pAmoebaOutOfPlaneBendParameter );
    if( amoebaGpu->psAmoebaTorsionTorsionID1)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID1, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID1, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID2, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID2, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID3, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID3, log );
-    gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionTorsionGrids, log );
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionTorsionGrids, log );
    (void) fprintf( log, "\n" );
    (void) fprintf( log, "     amoebaTorsionTorsions              %u\n",      amoebaGpu->amoebaSim.amoebaTorsionTorsions );
    (void) fprintf( log, "     amoebaTorsionTorsion_offset        %u\n",      amoebaGpu->amoebaSim.amoebaTorsionTorsion_offset );
@@ -333,8 +342,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pOutputBufferCounter               %p\n",      amoebaGpu->gpuContext->pOutputBufferCounter );
    if( amoebaGpu->psAmoebaUreyBradleyParameter)(void) fprintf( log, "\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaUreyBradleyID, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaUreyBradleyID, log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaUreyBradleyParameter, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaUreyBradleyParameter, log );
    (void) fprintf( log, "     amoebaUreyBradleys                 %u\n",      amoebaGpu->amoebaSim.amoebaUreyBradleys );
    (void) fprintf( log, "     amoebaUreyBradley_offset           %u\n",      amoebaGpu->amoebaSim.amoebaUreyBradley_offset );
    (void) fprintf( log, "     cubic                              %15.7e\n",  amoebaGpu->amoebaSim.amoebaUreyBradleyCubicParameter);
@@ -343,20 +352,26 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pAmoebaUreyBradleyParameter        %p\n",      amoebaGpu->amoebaSim.pAmoebaUreyBradleyParameter );
    (void) fprintf( log, "\n\n" );
-    gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log );
    (void) fprintf( log, "     pMultipoleParticlesIdsAndAxisType  %p\n",      amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType);
    (void) fprintf( log, "     maxTorqueBufferIndex            %d\n",      amoebaGpu->maxTorqueBufferIndex );
-    gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log );
+    int memory   = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log );
+    if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory;
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
    (void) fprintf( log, "     psMultipoleParticlesTorqueBufferIndices %p\n",      amoebaGpu->amoebaSim.pMultipoleParticlesTorqueBufferIndices);
-    gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log );
    (void) fprintf( log, "     pMolecularDipole                   %p\n",      amoebaGpu->amoebaSim.pMolecularDipole);
-    gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log );
    (void) fprintf( log, "     pMolecularQuadrupole               %p\n",      amoebaGpu->amoebaSim.pMolecularQuadrupole );
-    gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameDipole, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameDipole, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameQuadrupole, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameQuadrupole, log );
    (void) fprintf( log, "     polarizationType                   %d\n",      amoebaGpu->amoebaSim.polarizationType );
    (void) fprintf( log, "     maxCovalentDegreeSz                %d\n",      amoebaGpu->maxCovalentDegreeSz );
@@ -380,15 +395,11 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     fd                                 %15.7e\n",  amoebaGpu->amoebaSim.fd );
    (void) fprintf( log, "     fq                                 %15.7e\n",  amoebaGpu->amoebaSim.fq );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psDampingFactorAndThole, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psDampingFactorAndThole, log );
-    gpuPrintCudaStreamInt( amoebaGpu->psCovalentDegree, log );
-    gpuPrintCudaStreamInt( amoebaGpu->psPolarizationDegree, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psE_Field, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psE_FieldPolar, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psPolarizability, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psE_Field, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psE_FieldPolar, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psPolarizability, log );
    (void) fprintf( log, "     mutualInducedIterativeMethod       %d\n",  amoebaGpu->mutualInducedIterativeMethod);
    (void) fprintf( log, "     mutualInducedMaxIterations         %d\n",  amoebaGpu->mutualInducedMaxIterations);
@@ -396,19 +407,22 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     mutualInducedTargetEpsilon         %10.3e\n",  amoebaGpu->mutualInducedTargetEpsilon);
    (void) fprintf( log, "     mutualInducedCurrentEpsilon        %10.3e\n",  amoebaGpu->mutualInducedCurrentEpsilon );
-    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log );
    (void) fprintf( log, "     numberOfSorWorkVectors             %u\n",  amoebaGpu->numberOfSorWorkVectors);
-    gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[0], log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[0], log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[1], log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[2], log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psGk_Field, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[3], log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipoleS, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolarS, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psBorn, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psGk_Field, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psBornPolar, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipoleS, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolarS, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psBorn, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psBornPolar, log );
    (void) fprintf( log, "     includeObcCavityTerm               %d\n",      amoebaGpu->includeObcCavityTerm );
    (void) fprintf( log, "     dielectricOffset                   %15.7e\n",  gpu->sim.dielectricOffset );
    (void) fprintf( log, "     probeRadius                        %15.7e\n",  gpu->sim.probeRadius );
@@ -420,14 +434,14 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     vdwEpsilonCombiningRule            %d\n",      amoebaGpu->vdwEpsilonCombiningRule);
    (void) fprintf( log, "     vdwUsePBC                          %d\n",      amoebaGpu->amoebaSim.vdwUsePBC);
    (void) fprintf( log, "     vdwCutoff2                         %15.7e\n",  amoebaGpu->amoebaSim.vdwCutoff2);
-    gpuPrintCudaStreamFloat2( amoebaGpu->psVdwSigmaEpsilon, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psVdwSigmaEpsilon, log );
-    gpuPrintCudaStreamInt( amoebaGpu->psAmoebaVdwNonReductionID, log );
+    totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psAmoebaVdwNonReductionID, log );
-    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaVdwReductionID, log );
+    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaVdwReductionID, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaVdwReduction, log );
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaVdwReduction, log );
-    gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaVdwCoordinates, log );
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaVdwCoordinates, log );
-    gpuPrintCudaStreamUnsignedInt( amoebaGpu->psVdwWorkUnit, log );
+    totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psVdwWorkUnit, log );
-    gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndicesIndex, log );
+    totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndicesIndex, log );
-    gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndices, log );
+    totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndices, log );
    (void) fprintf( log, "     amoebaVdwNonReductions             %u\n",      amoebaGpu->amoebaSim.amoebaVdwNonReductions );
    (void) fprintf( log, "     pAmoebaVdwNonReductionID           %p\n",      amoebaGpu->amoebaSim.pAmoebaVdwNonReductionID );
    (void) fprintf( log, "     amoebaVdwReductions                %u\n",      amoebaGpu->amoebaSim.amoebaVdwReductions );
@@ -436,7 +450,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pVdwExclusionIndicesIndex          %p\n",      amoebaGpu->amoebaSim.pVdwExclusionIndicesIndex);
    (void) fprintf( log, "     pVdwExclusionIndices               %p\n",      amoebaGpu->amoebaSim.pVdwExclusionIndices);
-    gpuPrintCudaStreamFloat2( amoebaGpu->psWcaDispersionRadiusEpsilon, log );
+    totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psWcaDispersionRadiusEpsilon, log );
    (void) fprintf( log, "\n" );
    (void) fprintf( log, "     epso                               %15.7e\n",  amoebaGpu->amoebaSim.epso );
    (void) fprintf( log, "     epsh                               %15.7e\n",  amoebaGpu->amoebaSim.epsh );
@@ -447,6 +461,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     dispoff                            %15.7e\n",  amoebaGpu->amoebaSim.dispoff );
    (void) fprintf( log, "     totalMaxWcaDispersionEnergy        %15.7e\n",  amoebaGpu->amoebaSim.totalMaxWcaDispersionEnergy );
+    (void) fprintf( log, "     total array memory                     %d\n",  totalMemory );
    (void) fflush( log );
 }
@@ -1349,20 +1365,14 @@ static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu )
    amoebaGpu->psDampingFactorAndThole               = new CUDAStream<float2>(paddedNumberOfAtoms, 1, "DampingFactorAndThole");
    amoebaGpu->amoebaSim.pDampingFactorAndThole      = amoebaGpu->psDampingFactorAndThole->_pDevData;
-    amoebaGpu->psCovalentDegree                      = new CUDAStream<int>(amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 1, "CovalentDegree");
+    amoebaGpu->covalentDegree.resize(      amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 0 );
-    amoebaGpu->psPolarizationDegree                  = new CUDAStream<int>(amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 1, "PolarizationDegree");
+    amoebaGpu->polarizationDegree.resize( amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 0 );
    unsigned int offset                              = paddedNumberOfAtoms*sizeof( float );
    memset( amoebaGpu->psDampingFactorAndThole->_pSysData,              0,2*offset );
    //memset( amoebaGpu->psE_Field->_pSysData,            0, offset*3 );
    //memset( amoebaGpu->psE_FieldPolar->_pSysData,       0, offset*3 );
-    // should be removed XXXXX
-    offset                                           = amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms*sizeof( int );
-    memset( amoebaGpu->psCovalentDegree->_pSysData,     0, offset );
-    memset( amoebaGpu->psPolarizationDegree->_pSysData, 0, offset );
 }
 /**---------------------------------------------------------------------------------------
@@ -1666,7 +1676,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
        const int particlesOffset                                        = ii*amoebaGpu->maxCovalentDegreeSz;
        const int minCovalentIndex                                       = minCovalentIndices[ii];
-        amoebaGpu->psCovalentDegree->_pSysData[particlesOffset]          = minCovalentIndex;
+        amoebaGpu->covalentDegree[particlesOffset]                       = minCovalentIndex;
        // covalent info
@@ -1680,7 +1690,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                               ") is out of range -- maxCovalentDegreeSz needs to be increased." << std::endl;
                    errorCount++;
                } else {
-                    amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+covalentIndex] = covalentDegree[jj] + 1;
+                    amoebaGpu->covalentDegree[particlesOffset+covalentIndex] = covalentDegree[jj] + 1;
                }
            }
        }
@@ -1688,7 +1698,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
        // polarization covalent info
        const int minCovalentPolarizationIndex                                = minCovalentPolarizationIndices[ii];
-        amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset]      = minCovalentPolarizationIndex;
+        amoebaGpu->polarizationDegree[particlesOffset]      = minCovalentPolarizationIndex;
        for( unsigned int jj = 4; jj < covalentInfo.size(); jj++ ){
            const std::vector<int> covalentList = covalentInfo[jj];
@@ -1699,7 +1709,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                               ") is out of range -- maxCovalentDegreeSz needs to be increased." << std::endl;
                    errorCount++;
                } else {
-                    amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+covalentIndex] = covalentDegree[jj] + 1;
+                    amoebaGpu->polarizationDegree[particlesOffset+covalentIndex] = covalentDegree[jj] + 1;
                }
            }
        }
@@ -1739,7 +1749,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            // covalent/polarization degree
            (void) fprintf( amoebaGpu->log,"%3d covalent/polarization degree: minIdx[%6d %6d] Thole=%12.5f dampingFactor=%12.5f\n", ii,
-                            amoebaGpu->psCovalentDegree->_pSysData[particlesOffset],  amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset],
+                            amoebaGpu->covalentDegree[particlesOffset],  amoebaGpu->polarizationDegree[particlesOffset],
                            amoebaGpu->psDampingFactorAndThole->_pSysData[ii].y, amoebaGpu->psDampingFactorAndThole->_pSysData[ii].x );
            // covalent
@@ -1752,20 +1762,20 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                int count = 0;
                for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
-                    if( amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj] == kk ){ 
+                    if( amoebaGpu->covalentDegree[particlesOffset+jj] == kk ){ 
                        if( count == 0 ){
                            (void) fprintf( amoebaGpu->log,"%d [", kk );
                        }
                        float pScale        = polarScale[kk-1];
-                        int particle2Index  = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] + jj - 1;
+                        int particle2Index  = amoebaGpu->covalentDegree[particlesOffset] + jj - 1;
-                        if( kk == 4 && particle2Index >= amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] ){
+                        if( kk == 4 && particle2Index >= amoebaGpu->polarizationDegree[particlesOffset] ){
-                            int particle2Offset = particle2Index - amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] + 1;
+                            int particle2Offset = particle2Index - amoebaGpu->polarizationDegree[particlesOffset] + 1;
-                            if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+particle2Offset] == 1 ){
+                            if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->polarizationDegree[particlesOffset+particle2Offset] == 1 ){
                                pScale *= 0.5;
                            }
                        }
                        (void) fprintf( amoebaGpu->log,"%5d %5.1f   ",
-                                        amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] + jj - 1, pScale );
+                                        amoebaGpu->covalentDegree[particlesOffset] + jj - 1, pScale );
                        count++;
                   }
                }
@@ -1782,11 +1792,11 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                int count = 0;
                for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
-                    if( amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj] == kk ){ 
+                    if( amoebaGpu->polarizationDegree[particlesOffset+jj] == kk ){ 
                        if( count == 0 ){
                            (void) fprintf( amoebaGpu->log,"%d [", kk );
                        }
-                        (void) fprintf( amoebaGpu->log,"%5d ", amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] + jj - 1 );
+                        (void) fprintf( amoebaGpu->log,"%5d ", amoebaGpu->polarizationDegree[particlesOffset] + jj - 1 );
                        count++;
                    }
                }
@@ -1818,19 +1828,19 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            // print entries w/ degree=kk
            for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
-                if( amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj] ){ 
+                if( amoebaGpu->covalentDegree[particlesOffset+jj] ){ 
-                    int index           = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj];
+                    int index           = amoebaGpu->covalentDegree[particlesOffset+jj];
                    float pScale        = polarScale[index-1];
                    float mScale        = mpoleScale[index-1];
-                    int particle2Index  = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] + jj - 1;
+                    int particle2Index  = amoebaGpu->covalentDegree[particlesOffset] + jj - 1;
-                    if( index == 4 && particle2Index >= amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] ){
+                    if( index == 4 && particle2Index >= amoebaGpu->polarizationDegree[particlesOffset] ){
-                        int particle2Offset = particle2Index - amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] + 1;
+                        int particle2Offset = particle2Index - amoebaGpu->polarizationDegree[particlesOffset] + 1;
-                        if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+particle2Offset] == 1 ){
+                        if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->polarizationDegree[particlesOffset+particle2Offset] == 1 ){
                            pScale *= 0.5;
                        }
                    }
                    pScaleCheckSum[ii] += (pScale - 1.0f);
-                    int covIndex        = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset];
+                    int covIndex        = amoebaGpu->covalentDegree[particlesOffset];
                    if( pScale != 1.0f ){
                        MapIntFloat* pMap = amoebaGpu->pMapArray[ii];
                        (*pMap)[covIndex+jj-1] = pScale;
@@ -1841,10 +1851,10 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            // polarization
            for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
-                if( amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj] ){ 
+                if( amoebaGpu->polarizationDegree[particlesOffset+jj] ){ 
-                    int index    = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj];
+                    int index    = amoebaGpu->polarizationDegree[particlesOffset+jj];
                    dScaleCheckSum[ii] += (directScale[index-1] - 1.0f);
-                    int covIndex        = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset];
+                    int covIndex        = amoebaGpu->polarizationDegree[particlesOffset];
                    if( directScale[index-1] != 1.0f ){
                        MapIntFloat* dMap      = amoebaGpu->dMapArray[ii];
                        (*dMap)[covIndex+jj-1] = directScale[index-1];
@@ -1872,8 +1882,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Upload();
    amoebaGpu->psMolecularDipole->Upload();
    amoebaGpu->psMolecularQuadrupole->Upload();
-    amoebaGpu->psCovalentDegree->Upload();
-    amoebaGpu->psPolarizationDegree->Upload();
    amoebaGpu->psDampingFactorAndThole->Upload();
    amoebaGpu->psPolarizability->Upload();
    amoebaGpu->gpuContext->psPosq4->Upload();
@@ -2652,21 +2660,27 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
    delete gpu->psMolecularDipole;
    delete gpu->psMolecularQuadrupole;
    delete gpu->psLabFrameDipole;
    delete gpu->psLabFrameQuadrupole;
    delete gpu->psDampingFactorAndThole;
-    delete gpu->psCovalentDegree;
-    delete gpu->psPolarizationDegree;
    delete gpu->psE_Field;
    delete gpu->psE_FieldPolar;
    delete gpu->psInducedDipole;
    delete gpu->psInducedDipolePolar;
    delete gpu->psPolarizability;
    delete gpu->psCurrentEpsilon;
    delete gpu->psWorkVector[0];
    delete gpu->psWorkVector[1];
    delete gpu->psWorkVector[2];
    delete gpu->psWorkVector[3];
    delete gpu->psTorque;
    delete gpu->psGk_Field;
@@ -2773,6 +2787,7 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener
    amoebaGpu->amoebaSim.pWorkArray_3_2         = amoebaGpu->psWorkArray_3_2->_pDevData;
    // used in GK calculations
    if( hasAmoebaGeneralizedKirkwood )
    {
        if( amoebaGpu->psWorkArray_3_3 )
@@ -2810,19 +2825,19 @@ static void getScalingDegrees( amoebaGpuContext amoebaGpu, unsigned int particle
 {
    int particlesOffset                        = particleI*amoebaGpu->maxCovalentDegreeSz;
-    unsigned int minCovalentIndex              = static_cast<unsigned int>(amoebaGpu->psCovalentDegree->_pSysData[particlesOffset]);
+    unsigned int minCovalentIndex              = static_cast<unsigned int>(amoebaGpu->covalentDegree[particlesOffset]);
-    unsigned int minCovalentPolarizationIndex  = static_cast<unsigned int>(amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset]);
+    unsigned int minCovalentPolarizationIndex  = static_cast<unsigned int>(amoebaGpu->polarizationDegree[particlesOffset]);
    if( particleJ < minCovalentIndex || particleJ >= (minCovalentIndex + amoebaGpu->maxCovalentDegreeSz-1) ){
        *covalentDegree     = 0;
    } else {
-        *covalentDegree     = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset + (particleJ-minCovalentIndex) + 1];
+        *covalentDegree     = amoebaGpu->covalentDegree[particlesOffset + (particleJ-minCovalentIndex) + 1];
    }
    if( particleJ < minCovalentPolarizationIndex || particleJ >= (minCovalentPolarizationIndex + amoebaGpu->maxCovalentDegreeSz-1) ){
        *polarizationDegree = 0;
    } else {
-        *polarizationDegree = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset + (particleJ-minCovalentPolarizationIndex) + 1];
+        *polarizationDegree = amoebaGpu->polarizationDegree[particlesOffset + (particleJ-minCovalentPolarizationIndex) + 1];
    }
 /* if( *covalentDegree > 5 || *polarizationDegree > 5 ){
@@ -2886,9 +2901,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
    // ---------------------------------------------------------------------------------------
-    if( amoebaGpu->psCovalentDegree == NULL ){
+    if( amoebaGpu->covalentDegree.size() < 1 )return;
-        return;
-    }    
    const unsigned int paddedAtoms     = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    const unsigned int actualAtoms     = amoebaGpu->gpuContext->natoms;
@@ -2914,18 +2927,18 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
    {
        int x                  = atom1/grid;
        int particlesOffset    = atom1*amoebaGpu->maxCovalentDegreeSz;
-        int minCovalentIndex   = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset];
+        int minCovalentIndex   = amoebaGpu->covalentDegree[particlesOffset];
-        int minPolarCovIndex   = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset];
+        int minPolarCovIndex   = amoebaGpu->polarizationDegree[particlesOffset];
        int maxCIndex          = 0;
        int maxPIndex          = 0;
        for (int jj = amoebaGpu->maxCovalentDegreeSz - 1; jj >= 1 && (maxPIndex == 0 || maxCIndex == 0); jj-- )
        {
-            if( amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj] && maxCellIndex[x] < (minCovalentIndex+jj) )
+            if( amoebaGpu->covalentDegree[particlesOffset+jj] && maxCellIndex[x] < (minCovalentIndex+jj) )
            { 
                maxCellIndex[x] =  minCovalentIndex + jj;
                maxCIndex++; 
            }
-            if( amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj] && maxCellIndex[x] < (minPolarCovIndex+jj) )
+            if( amoebaGpu->polarizationDegree[particlesOffset+jj] && maxCellIndex[x] < (minPolarCovIndex+jj) )
            { 
                maxCellIndex[x] =  minPolarCovIndex + jj;
                maxPIndex++; 

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -30,20 +30,10 @@
 #include "kernels/gputypes.h"
 #include "amoebaCudaTypes.h"
-#define THREADS_PER_BLOCK 256
 #include <map>
 typedef std::map<int,float> MapIntFloat;
 typedef MapIntFloat::const_iterator MapIntFloatCI;
-/* 
- * Remove
- * pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
- * allocation of torqueMapForce psCovalentDegree psPolarizationDegree
- * 
-   THREADS_PER_BLOCK
- */
 struct _amoebaGpuContext {
    _gpuContext* gpuContext;
@@ -112,7 +102,6 @@ struct _amoebaGpuContext {
    // multipole parameters
    CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
-    CUDAStream<int>* psMultipoleAxisOffset;
    // buffer indices used for mapping torques onto forces 
@@ -133,10 +122,10 @@ struct _amoebaGpuContext {
    CUDAStream<float2>*  psDampingFactorAndThole;
-    // slated for removal -- no longer used
+    // used to setup scaling constants
-    CUDAStream<int>*    psCovalentDegree;
+    std::vector<int>    covalentDegree;
-    CUDAStream<int>*    psPolarizationDegree;
+    std::vector<int>    polarizationDegree;
    // fixed-E field

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel(
                   float* inducedDipolePolarS )
 {
-    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*cSim.atoms )return;
+    while( pos < 3*cSim.atoms )
+    {
+        fixedEField[pos]          *= polarizability[pos];
+        inducedDipole[pos]         = fixedEField[pos];
-    fixedEField[threadId]          *= polarizability[threadId];
+        fixedEFieldPolar[pos]     *= polarizability[pos];
-    inducedDipole[threadId]         = fixedEField[threadId];
+        inducedDipolePolar[pos]    = fixedEFieldPolar[pos];
-    fixedEFieldPolar[threadId]     *= polarizability[threadId];
+        fixedGkField[pos]         *= polarizability[pos];
-    inducedDipolePolar[threadId]    = fixedEFieldPolar[threadId];
+        inducedDipoleS[pos]        = fixedEField[pos]       + fixedGkField[pos];
+        inducedDipolePolarS[pos]   = fixedEFieldPolar[pos]  + fixedGkField[pos];
-    fixedGkField[threadId]         *= polarizability[threadId];
+        pos                       += blockDim.x*gridDim.x;
-    inducedDipoleS[threadId]        = fixedEField[threadId]       + fixedGkField[threadId];
+    }
-    inducedDipolePolarS[threadId]   = fixedEFieldPolar[threadId]  + fixedGkField[threadId];
 }
@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel(
 {
    float polarSOR = 0.70f;
-    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*cSim.atoms)return;
+    while( pos < 3*cSim.atoms )
+    {
-    float previousDipole                = inducedDipole[threadId];
+        float previousDipole           = inducedDipole[pos];
-    float previousDipoleP               = inducedDipoleP[threadId];
+        float previousDipoleP          = inducedDipoleP[pos];
-    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
+        inducedDipole[pos]             = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
-    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
+        inducedDipoleP[pos]            = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        inducedDipole[pos]             = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        inducedDipoleP[pos]            = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        matrixProduct[pos]             = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+        matrixProductP[pos]            = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
+        pos                           += blockDim.x*gridDim.x;
+    }
 }
 __global__ 
@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
 {
    float polarSOR = 0.70f;
-    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*cSim.atoms)return;
+    while( pos < 3*cSim.atoms )
+    {
-    float previousDipole                = inducedDipole[threadId];
+        float previousDipole      = inducedDipole[pos];
-    float previousDipoleP               = inducedDipoleP[threadId];
+        float previousDipoleP     = inducedDipoleP[pos];
-    inducedDipole[threadId]             = fixedGkField[threadId]    + fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
+        inducedDipole[pos]        = fixedGkField[pos]    + fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
-    inducedDipoleP[threadId]            = fixedGkField[threadId]    + fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
+        inducedDipoleP[pos]       = fixedGkField[pos]    + fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        inducedDipole[pos]        = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        inducedDipoleP[pos]       = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        matrixProduct[pos]        = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+        matrixProductP[pos]       = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
+        pos                      += blockDim.x*gridDim.x;
+    }
 }
 // reduce psWorkArray_3_1 -> outputArray
@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
    LAUNCHERROR("kReduceMutualInducedAndGkFields4");
 }
-#ifdef AMOEBA_DEBUG
-#if 0
-static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
-{
-    (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
-    unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
-    unsigned int stop  = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
-    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        unsigned int ii3Index      = ii/3;
-        unsigned int bufferIndex   = ii3Index/(gpu->sim.paddedNumberOfAtoms);
-        unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
-    } 
-}
-static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
-{
-    (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
-    for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
-        unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
-    } 
-}
-#endif
-#endif
 /**---------------------------------------------------------------------------------------
   Compute mutual induce field
@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
        amoebaGpu->psWorkArray_3_3->Download();
        amoebaGpu->psWorkArray_3_4->Download();
-        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
-        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
-        //printMiFieldAtomBuffers( amoebaGpu, 100 );
-        //printMiFieldBuffer( amoebaGpu, 0 );
-        //printMiFieldBuffer( amoebaGpu, 1 );
-        //printMiFieldBuffer( amoebaGpu, 37 );
-        //printMiFieldBuffer( amoebaGpu, 38 );
    if( amoebaGpu->log && iteration == 1 ){
        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
    int iteration;
    gpuContext gpu     = amoebaGpu->gpuContext;
-    int numOfElems     = gpu->natoms*3;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log && timestep == 1 ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
-                        "maxIterations=%d targetEpsilon=%.3e\n", 
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
-                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
   // ---------------------------------------------------------------------------------------
    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-    kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
+    kInitializeMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
         amoebaGpu->psGk_Field->_pDevData,
@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
        // post matrix multiply
-        kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData,     amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,           amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psWorkVector[0]->_pDevData,     amoebaGpu->psWorkVector[1]->_pDevData );
        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");  
-        kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedAndGkFieldS_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipoleS->_pDevData,    amoebaGpu->psInducedDipolePolarS->_pDevData,
           amoebaGpu->psE_Field->_pDevData,          amoebaGpu->psE_FieldPolar->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel(
                   float* inducedDipolePolar )
 {
-    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*numberOfAtoms )return;
+    while( pos < 3*cSim.atoms )
+    {
+        fixedEField[pos]         *= polarizability[pos];
+        inducedDipole[pos]        = fixedEField[pos];
-    fixedEField[threadId]         *= polarizability[threadId];
+        fixedEFieldPolar[pos]    *= polarizability[pos];
-    inducedDipole[threadId]        = fixedEField[threadId];
+        inducedDipolePolar[pos]   = fixedEFieldPolar[pos];
-    fixedEFieldPolar[threadId]    *= polarizability[threadId];
+        pos                      += blockDim.x*gridDim.x;
-    inducedDipolePolar[threadId]   = fixedEFieldPolar[threadId];
+    }
 }
@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel(
 {
    float polarSOR = 0.70f;
-    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*numberOfEntries )return;
+    while( pos < 3*cSim.atoms )
+    {
+        float previousDipole       = inducedDipole[pos];
+        float previousDipoleP      = inducedDipoleP[pos];
-    float previousDipole                = inducedDipole[threadId];
+        inducedDipole[pos]         = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
-    float previousDipoleP               = inducedDipoleP[threadId];
+        inducedDipoleP[pos]        = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
-    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
+        inducedDipole[pos]         = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
-    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
+        inducedDipoleP[pos]        = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        matrixProduct[pos]         = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        matrixProductP[pos]        = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        pos                       += blockDim.x*gridDim.x;
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+    }
 }
@@ -470,28 +478,13 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
    int iteration;
    gpuContext gpu    = amoebaGpu->gpuContext;
-    int numOfElems     = gpu->natoms*3;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
-                        "maxIterations=%d targetEpsilon=%.3e\n", 
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
-                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
   // ---------------------------------------------------------------------------------------
    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-    kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+    kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
         gpu->natoms,
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        // post matrix multiply
-        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel(
                   float* inducedDipolePolar )
 {
-    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*numberOfAtoms )return;
+    while( pos < 3*cSim.atoms )
+    {   
-    fixedEField[threadId]         *= polarizability[threadId];
+        fixedEField[pos]         *= polarizability[pos];
-    inducedDipole[threadId]        = fixedEField[threadId];
+        inducedDipole[pos]        = fixedEField[pos];
-    fixedEFieldPolar[threadId]    *= polarizability[threadId];
+        fixedEFieldPolar[pos]    *= polarizability[pos];
-    inducedDipolePolar[threadId]   = fixedEFieldPolar[threadId];
+        inducedDipolePolar[pos]   = fixedEFieldPolar[pos];
+        pos                      += blockDim.x*gridDim.x;
+    }
 }
@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel(
                   float* matrixProduct, float* matrixProductP )
 {
-    int threadId                        = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*numberOfEntries )return;
+    while( pos < 3*cSim.atoms )
+    {   
-    float previousDipole                = inducedDipole[threadId];
+        float previousDipole           = inducedDipole[pos];
-    float previousDipoleP               = inducedDipoleP[threadId];
+        float previousDipoleP          = inducedDipoleP[pos];
        // add self terms to fields
        const float term               = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    matrixProduct[threadId]            +=  term*previousDipole;
+        matrixProduct[pos]            +=  term*previousDipole;
-    matrixProductP[threadId]           +=  term*previousDipoleP;
+        matrixProductP[pos]           +=  term*previousDipoleP;
-    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
+        inducedDipole[pos]             = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
-    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];
+        inducedDipoleP[pos]            = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
        const float polarSOR           = 0.70f;
-    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
+        inducedDipole[pos]             = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
-    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );
+        inducedDipoleP[pos]            = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
+        matrixProduct[pos]             = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
+        matrixProductP[pos]            = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
-    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
+        pos                           += blockDim.x*gridDim.x;
-    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
+    }
 }
@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
    int iteration;
     gpuContext gpu    = amoebaGpu->gpuContext;
-    int numOfElems     = gpu->natoms*3;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
-                        "maxIterations=%d targetEpsilon=%.3e\n", 
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
-                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
   // ---------------------------------------------------------------------------------------
    // set [ E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-    kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+    kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
         gpu->natoms,
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        // post matrix multiply
-        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
+        kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
           gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
-//-----------------------------------------------------------------------------------------
-//-----------------------------------------------------------------------------------------
-#include "amoebaGpuTypes.h"
-#include "amoebaCudaKernels.h"
-#include <stdio.h>
-#undef AMOEBA_OFFSET_3
-#undef AMOEBA_INCLUDE_DIAGONAL 
-#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset1##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#define AMOEBA_OFFSET_3
-#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset3##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#undef AMOEBA_OFFSET_3
-#define AMOEBA_INCLUDE_DIAGONAL 
-#define METHOD_NAME(a, b) a##IncludeDiagonalOffset1##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#define AMOEBA_OFFSET_3
-#define METHOD_NAME(a, b) a##IncludeDiagonalOffset3##b
-#include "kCalculateAmoebaCudaReduce.h"
-#undef METHOD_NAME
-#undef AMOEBA_OFFSET_3
-#undef AMOEBA_INCLUDE_DIAGONAL 
-void cudaReduceN2ToN( float *N2Array, int Nsz, float *NArray, int includeDiagonal, int offset )
-{
-    int numThreads        = min(THREADS_PER_BLOCK, (Nsz));
-    int numBlocksPerAtom  = (Nsz / numThreads);
-    if( Nsz % numThreads ){
-        numBlocksPerAtom++;
-    }
-    int numBlocks = numBlocksPerAtom*Nsz;
-    float *partialSum1_d;
-    // allocate GPU memory 
-    cudaMalloc( (void**) &partialSum1_d, numBlocks*offset*sizeof(float) );  
-    if( includeDiagonal ){
-       if( offset == 3 ){
-           kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel1");
-       } else if( offset == 1 ){
-           kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel2");
-       }
-    } else {
-       if( offset == 3 ){
-           kCalculateAmoebaReduceExcludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel3");
-       } else if( offset == 1 ){
-           kCalculateAmoebaReduceExcludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
-           LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel4");
-       }
-    }
-    int numBlocks2 = numBlocks;
-    numBlocks      = numBlocks2*Nsz/numThreads;
-    if( (numBlocks2*Nsz) % numThreads ){
-        numBlocks++;
-    }
-    if( offset == 3 ){
-        kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
-        LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal3");
-    } else if( offset == 1 ){
-        kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
-        LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal1");
-    }
-    //Free memory
-    cudaFree(partialSum1_d);
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
-//-----------------------------------------------------------------------------------------
-//-----------------------------------------------------------------------------------------
-typedef unsigned int uint;
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaReduce, N2ToNBlockLevel)( float *N2Array, float *partialSum, int num,int numberOfBlocksPerAtom )
-{
-   uint tid = threadIdx.x;
-   __shared__ float asx[THREADS_PER_BLOCK];
-   asx[tid] = 0.0f;
-#ifdef AMOEBA_OFFSET_3
-    __shared__ float asy[THREADS_PER_BLOCK];
-    __shared__ float asz[THREADS_PER_BLOCK];
-    asx[tid] = 0.0f;
-    asy[tid] = asz[tid] = 0.0f;
-    int offset = 3;
-#else
-    int offset = 1;
-#endif
-    int atomI =  blockIdx.x / numberOfBlocksPerAtom;
-    int atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x+tid;
-#ifdef AMOEBA_INCLUDE_DIAGONAL
-    if( atomJ < num && atomI < num ){
-#else
-    if( atomJ < num && atomJ != atomI ){
-#endif
-      int index = offset*(atomI*num + atomJ);
-      asx[tid] = N2Array[index];
-#ifdef AMOEBA_OFFSET_3
-      asy[tid] = N2Array[index+1];
-      asz[tid] = N2Array[index+2];
-#endif
-    }
-    __syncthreads(); //to make sure all the elements are loaded
-    for( uint s = (blockDim.x)/2; s != 0; s >>= 1 ){
-      if( tid < s ){
-        asx[tid] += asx[tid+s];
-#ifdef AMOEBA_OFFSET_3
-        asy[tid] += asy[tid+s];
-        asz[tid] += asz[tid+s];
-#endif
-      }
-      __syncthreads();
-    }
-    if( tid == 0 ){
-      partialSum[blockIdx.x*offset] = asx[0];
-#ifdef AMOEBA_OFFSET_3
-      partialSum[blockIdx.x*3+1]    = asy[0];
-      partialSum[blockIdx.x*3+2]    = asz[0];
-#endif
-    }  
-}
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaReduce, N2ToNFinal)( float *partialSum, float *final,int num,int numberOfBlocksPerAtom )
-{
-    uint thread_id = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( thread_id > num )return;
-    float3 sum;
-#ifdef AMOEBA_OFFSET_3
-    int offset = 3;
-    sum.x = sum.y = sum.z = 0.0f;
-#else
-    int offset = 1;
-    sum.x      = 0.0f;
-#endif
-    int index = thread_id*offset*numberOfBlocksPerAtom;
-    for( int i=0; i < numberOfBlocksPerAtom; i++ ){
-      sum.x += partialSum[index + i*offset];
-#ifdef AMOEBA_OFFSET_3
-      sum.y += partialSum[index + i*offset+1];
-      sum.z += partialSum[index + i*offset+2];
-#endif
-    }
-    final[thread_id*offset   ] = sum.x;
-#ifdef AMOEBA_OFFSET_3
-    final[thread_id*3+1 ]      = sum.y;
-    final[thread_id*3+2 ]      = sum.z;
-#endif
-}