Removal of limitation for 'long-range in sequence' covalent bonds

Reduced memory footprint

Removal of limitation for 'long-range in sequence' covalent bonds
Reduced memory footprint
761d7e17 · Mark Friedrichs · 80c4976e · 761d7e17 · 761d7e17 · 761d7e17
Commit 761d7e17 authored Apr 07, 2011 by Mark Friedrichs
16 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
@@ -118,7 +118,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
 #endif
 );

-                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;

                // add to field at atomI the field due atomJ's dipole

@@ -162,7 +162,7 @@ if( atomI == targetAtom ){

                // atomI == atomJ contribution included

-                mask                    =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                mask                    =  ( (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
                fieldSumS[0]           += mask ? ijField[0][0] : 0.0f;
                fieldSumS[1]           += mask ? ijField[0][1] : 0.0f;
                fieldSumS[2]           += mask ? ijField[0][2] : 0.0f;
@@ -181,7 +181,7 @@ if( atomI == targetAtom ){
        index                              = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
        index                              = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = bornRadii[atomI];
        debugArray[index].y                = jBornRadius;
        debugArray[index].w                = 9.0f;
@@ -195,7 +195,7 @@ if( atomI == targetAtom ){

 #ifdef USE_OUTPUT_BUFFER_PER_WARP

-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);

            load3dArrayBufferPerWarp( offset, fieldSum,       outputField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputFieldPolar );
@@ -204,7 +204,7 @@ if( atomI == targetAtom ){
            load3dArrayBufferPerWarp( offset, fieldPolarSumS, outputFieldPolarS );

 #else
-            unsigned int offset                   = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                   = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);

            load3dArray( offset, fieldSum,        outputField );
            load3dArray( offset, fieldPolarSum,   outputFieldPolar);
@@ -241,7 +241,7 @@ if( atomI == targetAtom ){
 #endif
   );

-                if( (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
+                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
           
                    // add to field at atomI the field due atomJ's dipole
    
@@ -294,7 +294,7 @@ if( atomI == targetAtom ){
 unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
 if( atomI == targetAtom  || (y + tj) == targetAtom ){
        unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
-        unsigned int maskD                 = (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms);
+        unsigned int maskD                 = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);

        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) (y + tj);
@@ -314,7 +314,7 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
 );


-                if( (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
+                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){

                    fieldSumS[0]           += ijField[0][0];
                    fieldSumS[1]           += ijField[0][1];
@@ -341,14 +341,14 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
 #if 0
 if( atomI == targetAtom  || (y + tj) == targetAtom ){
        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-        unsigned int maskD                 = (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms);
+        unsigned int maskD                 = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);

        index                              = debugAccumulate( index, debugArray, ijField[indexI],   maskD, -5.0f );
        index                              = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
        index                              = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
        index                              = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = bornRadii[atomI];
        debugArray[index].y                = jBornRadius;
        debugArray[index].w                = -9.0f;
@@ -362,13 +362,13 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,       outputField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputFieldPolar);
            load3dArrayBufferPerWarp( offset, fieldSumS,      outputFieldS );
            load3dArrayBufferPerWarp( offset, fieldPolarSumS, outputFieldPolarS );

-            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);

            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field,       outputField );
            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar,  outputFieldPolar);
@@ -376,13 +376,13 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolarS, outputFieldPolarS);

 #else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,       outputField );
            load3dArray( offset, fieldPolarSum,  outputFieldPolar);
            load3dArray( offset, fieldSumS,      outputFieldS );
            load3dArray( offset, fieldPolarSumS, outputFieldPolarS);

-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, sA[threadIdx.x].field,       outputField );
            load3dArray( offset, sA[threadIdx.x].fieldPolar,  outputFieldPolar);
            load3dArray( offset, sA[threadIdx.x].fieldS,      outputFieldS );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -217,13 +217,14 @@ void kSorUpdateMutualInducedField_kernel(

 static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceMI_Fields1");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
    LAUNCHERROR("kReduceMI_Fields2");
 }
@@ -277,14 +278,14 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext

 #ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
 #endif

    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 amoebaGpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -296,7 +297,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext

    } else {

-        kCalculateAmoebaMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 amoebaGpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
@@ -105,7 +105,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
 #endif
 );

-                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;

                // add to field at atomI the field due atomJ's dipole

@@ -126,20 +126,20 @@ if( atomI == targetAtom ){
        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) (y + j);
        //debugArray[index].z                = cAmoebaSim.pDampingFactorAndThole[atomI].x;
-        debugArray[index].z                = (float) cAmoebaSim.numberOfAtoms;
+        debugArray[index].z                = (float) cSim.atoms;
        debugArray[index].w                = (float) (mask + 1);

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? ijField[indexI][0] : 0.0f;
        debugArray[index].y                = mask ? ijField[indexI][1] : 0.0f;
        debugArray[index].z                = mask ? ijField[indexI][2] : 0.0f;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? ijField[indexI+1][0] : 0.0f;
        debugArray[index].y                = mask ? ijField[indexI+1][1] : 0.0f;
        debugArray[index].z                = mask ? ijField[indexI+1][2] : 0.0f;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) x;
        debugArray[index].y                = (float) y;
        debugArray[index].z                = (float) 1.0f;
@@ -150,12 +150,12 @@ if( atomI == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset            = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset            = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);

 #else
-            unsigned int offset            = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset            = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,      outputField );
            load3dArray( offset, fieldPolarSum, outputFieldPolar);

@@ -191,7 +191,7 @@ if( atomI == targetAtom ){
 #endif
   );

-                unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask   =  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
           
                // add to field at atomI the field due atomJ's dipole

@@ -229,17 +229,17 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
        debugArray[index].z                = cAmoebaSim.pDampingFactorAndThole[atomI].x;
        debugArray[index].w                = (float) (mask+1);

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? ijField[indexI][0] : 0.0f;
        debugArray[index].y                = mask ? ijField[indexI][1] : 0.0f;
        debugArray[index].z                = mask ? ijField[indexI][2] : 0.0f;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? ijField[indexI+1][0] : 0.0f;
        debugArray[index].y                = mask ? ijField[indexI+1][1] : 0.0f;
        debugArray[index].z                = mask ? ijField[indexI+1][2] : 0.0f;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) x;
        debugArray[index].y                = (float) y;
        debugArray[index].z                = (float) -1.0f;
@@ -253,21 +253,21 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset     = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset     = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);

-            offset                  = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            offset                  = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);

            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field,      outputField );
            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);

 #else
-            unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,      outputField );
            load3dArray( offset, fieldPolarSum, outputFieldPolar);

-            offset                  = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            offset                  = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, sA[threadIdx.x].field,      outputField );
            load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
@@ -3,6 +3,7 @@
 //-----------------------------------------------------------------------------------------

 #include "amoebaGpuTypes.h"
+#include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include "bbsort.h"

@@ -982,7 +983,7 @@ void kRecordInducedDipoleField_kernel(float* output, float* outputPolar)
    }
 }

-extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
+extern void cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpuContext gpu, CUDAStream<float>* psTorque);

 /**
 * Compute the potential and forces due to the reciprocal space PME calculation for fixed multipoles.
@@ -1014,7 +1015,21 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu)
    LAUNCHERROR("kRecordFixedMultipoleField");
    kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy");
-    cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
+
+    if( 0 ){
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        fileId.push_back( 0 );
+        VectorOfDoubleVectors outputVector;
+        kReduceForces( gpu );
+        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceOnlyFixed", fileId, outputVector );
+        kClearForces( gpu );
+    }   
+
+    cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque);
+
 }

 /**
@@ -1047,6 +1062,5 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
    kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");

-    cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
-    LAUNCHERROR("cudaComputeAmoebaMapTorquesAndAddTotalForce2_kCalculateAmoebaPMEInducedDipoleForces");
+    cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque );
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -3,6 +3,7 @@
 //-----------------------------------------------------------------------------------------

 #include "amoebaGpuTypes.h"
+#include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

@@ -91,7 +92,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
 __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
                                   float4* debugArray, float4* pullBack )
 {
-    unsigned int index                 = atomI + atomJ*cAmoebaSim.paddedNumberOfAtoms;
+    unsigned int index                 = atomI + atomJ*cSim.paddedNumberOfAtoms;
    float blockId                      = 111.0f;

    debugArray[index].x                = (float) atomI;
@@ -100,7 +101,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
    debugArray[index].w                = blockId;

    for( int pullIndex = 0; pullIndex < 1; pullIndex++ ){
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
@@ -1082,66 +1083,17 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
 #define METHOD_NAME(a, b) a##CutoffByWarp##b
 #include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"

-// reduce psWorkArray_3_1 -> force
-// reduce psWorkArray_3_2 -> torque
+// reduce psWorkArray_3_1 -> torque

-static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
+static void kReduceTorque(amoebaGpuContext amoebaGpu )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                             amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
-    LAUNCHERROR("kReducePmeDirectElectrostaticForce");
-
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                             amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                             gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
+                             amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
    LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
 }

-/**---------------------------------------------------------------------------------------
-
-   Zero gpu->psForce4
-   @param amoebaGpu        amoebaGpu context
-
-   --------------------------------------------------------------------------------------- */
-
-static void zeroForce( amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu   = amoebaGpu->gpuContext;
-    for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-        gpu->psForce4->_pSysData[ii].x  = 0.0f;
-        gpu->psForce4->_pSysData[ii].y  = 0.0f;
-        gpu->psForce4->_pSysData[ii].z  = 0.0f;
-    }
-    gpu->psForce4->Upload();
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Copy gpu->psForce4 to amoebaGpu->psForce
-   @param amoebaGpu        amoebaGpu context
-
-   --------------------------------------------------------------------------------------- */
-
-static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
-{
-    gpuContext gpu   = amoebaGpu->gpuContext;
-    gpu->psForce4->Download();
-    int indexOffset  = 0;
-    for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-        amoebaGpu->psForce->_pSysData[indexOffset]    = conversion*(gpu->psForce4->_pSysData[ii].x);
-        amoebaGpu->psForce->_pSysData[indexOffset+1]  = conversion*(gpu->psForce4->_pSysData[ii].y);
-        amoebaGpu->psForce->_pSysData[indexOffset+2]  = conversion*(gpu->psForce4->_pSysData[ii].z);
-        indexOffset                                  += 3;
-    }
-    amoebaGpu->psForce->Upload();
-}
-
-//#define GET_INDUCED_DIPOLE_FROM_FILE
-#ifdef GET_INDUCED_DIPOLE_FROM_FILE
-#include <stdlib.h>
-#endif
-
 /**---------------------------------------------------------------------------------------

   Compute Amoeba dirrect space portion of electrostatic force & torque
@@ -1153,10 +1105,6 @@ static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
 void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 {

-   // ---------------------------------------------------------------------------------------
-
-    static unsigned int threadsPerBlock = 0;
-
 #ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
    static int timestep = 0;
@@ -1186,42 +1134,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
    unsigned int targetAtom                   = 49;
 #endif

-#ifdef GET_INDUCED_DIPOLE_FROM_FILE
-    //std::string fileName = "waterInducedDipole.txt";
-    std::string fileName = "water_3_MI.txt";
-    StringVectorVector fileContents;
-    readFile( fileName, fileContents );
-    unsigned int offset  = 0;
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
-    }
-    for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
-
-        StringVector lineTokens     = fileContents[ii];
-        unsigned int lineTokenIndex = 1;
-
-        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
-        amoebaGpu->psInducedDipole->_pSysData[offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipole->_pSysData[offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipole->_pSysData[offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        offset                                              -= 3;
-        amoebaGpu->psInducedDipolePolar->_pSysData[offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipolePolar->_pSysData[offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipolePolar->_pSysData[offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-    }
-    float conversion = 0.1f;
-    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
-        amoebaGpu->psInducedDipole->_pSysData[ii]       *= conversion;
-        amoebaGpu->psInducedDipolePolar->_pSysData[ii]  *= conversion;
-    }
-    amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
-    SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
-    amoebaGpu->psInducedDipole->Upload();
-    amoebaGpu->psInducedDipolePolar->Upload();
-#endif
-
    // on first pass, set threads/block

+    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
@@ -1233,154 +1148,44 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
    }

-    kClearFields_3( amoebaGpu, 2 );
+    kClearFields_3( amoebaGpu, 1 );

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n", 
-                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)),
-                        sizeof(PmeDirectElectrostaticParticle) );
-
-           (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
-                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                      sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
-                      (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
-         (void) fflush( amoebaGpu->log );
+        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
+                        sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
+        (void) fflush( amoebaGpu->log );
    }   
 #endif

    if (gpu->bOutputBufferPerWarp){

-        kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                          gpu->sim.pInteractingWorkUnit,
-                                                                          amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                          amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                          amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                          debugArray->_pDevData, targetAtom );
 #else
-                                                                          amoebaGpu->psWorkArray_3_2->_pDevData );
+                                                                          amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif

    } else {

-        kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                    gpu->sim.pInteractingWorkUnit,
-                                                                    amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                    amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                    amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                    debugArray->_pDevData, targetAtom );
 #else
-                                                                    amoebaGpu->psWorkArray_3_2->_pDevData );
+                                                                    amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
-    kReduceForceTorque( amoebaGpu );
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){

-        amoebaGpu->psForce->Download();
-        amoebaGpu->psTorque->Download();
-        debugArray->Download();
-
-
-        int maxPrint        = 5;
-        float conversion    = 1.0f/41.84f;
-        float forceSum[3]   = { 0.0f, 0.0f, 0.0f};
-
-        (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution conversion=%.5f\n", conversion ); (void) fflush( amoebaGpu->log );
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-            (void) fprintf( amoebaGpu->log, "%5d ", ii);
-
-            int indexOffset     = ii*3;
-
-            // force
-
-            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
-                            conversion*amoebaGpu->psForce->_pSysData[indexOffset],
-                            conversion*amoebaGpu->psForce->_pSysData[indexOffset+1],
-                            conversion*amoebaGpu->psForce->_pSysData[indexOffset+2] );
-
-            forceSum[0]         += amoebaGpu->psForce->_pSysData[indexOffset];
-            forceSum[1]         += amoebaGpu->psForce->_pSysData[indexOffset+1];
-            forceSum[2]         += amoebaGpu->psForce->_pSysData[indexOffset+2];
-
-            // torque
-
-            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
-                            conversion*amoebaGpu->psTorque->_pSysData[indexOffset],
-                            conversion*amoebaGpu->psTorque->_pSysData[indexOffset+1],
-                            conversion*amoebaGpu->psTorque->_pSysData[indexOffset+2] );
-
-            (void) fprintf( amoebaGpu->log,"\n" );
-            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-            }
-        }
-        (void) fflush( amoebaGpu->log );
-
-        if( 0 ){
-            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
-            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-            float torqueSum0[3]       = { 0.0f, 0.0f, 0.0f};
-            float torqueSum1[3]       = { 0.0f, 0.0f, 0.0f};
-            int offset0             = 7*paddedNumberOfAtoms;
-            int offset1             = 8*paddedNumberOfAtoms;
-            int offset2             = 7*paddedNumberOfAtoms;
-            int offset3             = 8*paddedNumberOfAtoms;
-            for( int jj = 0; jj < gpu->natoms; jj++ ){
-                int debugIndex = jj;
-                if( fabs( debugArray->_pSysData[debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
-                if( jj != targetAtom ){
-                    torqueSum0[0] += debugArray->_pSysData[debugIndex+offset0].x + debugArray->_pSysData[debugIndex+offset1].x; 
-                    torqueSum0[1] += debugArray->_pSysData[debugIndex+offset0].y + debugArray->_pSysData[debugIndex+offset1].y; 
-                    torqueSum0[2] += debugArray->_pSysData[debugIndex+offset0].z + debugArray->_pSysData[debugIndex+offset1].z; 
-
-                    torqueSum1[0] += debugArray->_pSysData[debugIndex+offset2].x + debugArray->_pSysData[debugIndex+offset3].x; 
-                    torqueSum1[1] += debugArray->_pSysData[debugIndex+offset2].y + debugArray->_pSysData[debugIndex+offset3].y; 
-                    torqueSum1[2] += debugArray->_pSysData[debugIndex+offset2].z + debugArray->_pSysData[debugIndex+offset3].z; 
-                }
-                if( jj == 2 ){
-                    offset0             += 2*paddedNumberOfAtoms;
-                    offset1             += 2*paddedNumberOfAtoms;
-                }
-                for( int kk = 0; kk < 12; kk++ ){
-                    (void) fprintf( amoebaGpu->log,"%5d %5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj, kk,
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                    debugIndex += paddedNumberOfAtoms;
-                }
-                (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] Sum\n", targetAtom, jj,
-                                 torqueSum0[0], torqueSum0[1], torqueSum0[2], torqueSum1[0], torqueSum1[1], torqueSum1[2]);
-                (void) fprintf( amoebaGpu->log,"\n" );
-            }
-        }
-        (void) fflush( amoebaGpu->log );
-
-        if( 1 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
-         }
-
-    }
-    delete debugArray;
-#endif
-
-    cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, gpu->psForce4 );
-
-    if( 0 ){
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        copyForce( amoebaGpu, -1.0f/41.84f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForce", fileId, outputVector );
-    }
+    kReduceTorque( amoebaGpu );
+    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );

 }

@@ -1391,100 +1196,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */
-extern double kReduceEnergy(  gpuContext gpu);

 void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
 {
-
-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        gpuContext gpu = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        float conversion = -1.0f/41.84;
-        copyForce( amoebaGpu, conversion );
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDemForce", fileId, outputVector );
-
-
-        int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* debugArray             = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "FArray");
-        int index                                 = 0;
-        for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-            debugArray->_pSysData[index]    = amoebaGpu->psForce->_pSysData[index];
-            debugArray->_pSysData[index+1]  = amoebaGpu->psForce->_pSysData[index+1];
-            debugArray->_pSysData[index+2]  = amoebaGpu->psForce->_pSysData[index+2];
-            index                          += 3;
-        }
-        //zeroForce( amoebaGpu );
-        double dem = kReduceEnergy( gpu );
-        kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
-        double dep = kReduceEnergy( gpu );
-        fprintf( stderr, "Recip Em=%15.7e ep=%15.7e  ttl=%15.7e", dem/4.184, (dep-dem)/4.184, dep/4.184 );
-        copyForce( amoebaGpu, conversion );
-        VectorOfDoubleVectors outputVector1;
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector1, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector1 );
-
-        VectorOfDoubleVectors outputVector2;
-        index = 0;
-        for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-            amoebaGpu->psForce->_pSysData[index]   -= debugArray->_pSysData[index];
-            amoebaGpu->psForce->_pSysData[index+1] -= debugArray->_pSysData[index+1];
-            amoebaGpu->psForce->_pSysData[index+2] -= debugArray->_pSysData[index+2];
-            index                                  += 3;
- 
-        }
-        amoebaGpu->psForce->Upload();
-        outputVector.resize(0);
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector2, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDepForce", fileId, outputVector2 );
-
-
-        zeroForce( amoebaGpu );
-        kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
-        //zeroForce( amoebaGpu );
-        exit(0);
-    }
-#endif
-
-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        gpuContext gpu = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-
-        zeroForce( amoebaGpu );
-        cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
-        copyForce( amoebaGpu, -1.0f/41.84f );
-
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "yCudaPmeDirectForce", fileId, outputVector );
-        zeroForce( amoebaGpu );
-    }
-
-    if( 0 ){
-        gpuContext gpu = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        zeroForce( amoebaGpu );
-        cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
-        kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
-        copyForce( amoebaGpu, -1.0f/41.84f );
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeForce", fileId, outputVector );
-    }
-
-    if( 0 ){
-        gpuContext gpu = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        copyForce( amoebaGpu, -1.0f/41.84f );
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePmeForce", fileId, outputVector );
-    }
-#endif
-
    cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
    kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -35,13 +35,12 @@ __launch_bounds__(128, 1)
 __launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
-                            unsigned int* workUnit, float* outputForce, float* outputTorque
+                            unsigned int* workUnit, float* outputTorque

 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
 #endif
 ){
-
 #ifdef AMOEBA_DEBUG
    int maxPullIndex = 7;
    float4 pullBack[12];
@@ -103,7 +102,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
            if (bExclusionFlag)
            {
                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                mScaleMask            = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
@@ -136,7 +135,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
                // by setting match flag

-                if( (atomI != atomJ) && (atomI < cAmoebaSim.numberOfAtoms) && (atomJ < cAmoebaSim.numberOfAtoms) )
+                if( (atomI != atomJ) && (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
                {
                    localParticle.force[0]      += forceTorqueEnergy[0].x;
                    localParticle.force[1]      += forceTorqueEnergy[0].y;
@@ -154,7 +153,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom || atomJ == targetAtom ){

-    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
    float blockId                      = 1.0f;

@@ -163,28 +162,28 @@ if( atomI == targetAtom || atomJ == targetAtom ){
    debugArray[index].z                = (float) y;
    debugArray[index].w                = blockId;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceTorqueEnergy[0].x  : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[0].y  : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[0].z  : 0.0f;
    debugArray[index].w                = mask ? forceTorqueEnergy[0].w  : 0.0f;


-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceTorqueEnergy[1].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[1].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[1].z : 0.0f;
-    float offsetF                      = (float)(3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
+    float offsetF                      = (float)(3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
    debugArray[index].w                = offsetF;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceTorqueEnergy[2].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[2].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[2].z : 0.0f;
    debugArray[index].w                = offsetF;

    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
@@ -197,7 +196,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){

            // include self energy and self torque

-            if( atomI < cAmoebaSim.numberOfAtoms ){
+            if( atomI < cSim.atoms ){
                calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
                float energy;
                calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
@@ -207,41 +206,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            float  of;
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-            of                                  = outputForce[offset];
-            of                                 += localParticle.force[0];
-            outputForce[offset]                 = of;
-
-            of                                  = outputForce[offset+1];
-            of                                 += localParticle.force[1];
-            outputForce[offset+1]               = of;
-
-            of                                  = outputForce[offset+2];
-            of                                 += localParticle.force[2];
-            outputForce[offset+2]               = of;
-
-            of                                  = outputTorque[offset];
-            of                                 += localParticle.torque[0];
-            outputTorque[offset]                = of;
-
-            of                                  = outputTorque[offset+1];
-            of                                 += localParticle.torque[1];
-            outputTorque[offset+1]              = of;
-
-            of                                  = outputTorque[offset+2];
-            of                                 += localParticle.torque[2];
-            outputTorque[offset+2]              = of;
-
+            unsigned int offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
+            add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
+            add3dArray( 3*offset, localParticle.torque, outputTorque );
 #else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            outputForce[offset]                 = localParticle.force[0];
-            outputForce[offset+1]               = localParticle.force[1];
-            outputForce[offset+2]               = localParticle.force[2];
-
-            outputTorque[offset]                = localParticle.torque[0];
-            outputTorque[offset+1]              = localParticle.torque[1];
-            outputTorque[offset+2]              = localParticle.torque[2];
+            unsigned int offset                 = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
+            add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
+            load3dArray( 3*offset, localParticle.torque, outputTorque );
 #endif

        } else {
@@ -271,7 +242,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
                {
                    unsigned int xi   = x >> GRIDBITS;
                    unsigned int yi   = y >> GRIDBITS;
-                    unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                    unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    mScaleMask        = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
@@ -306,7 +277,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){

                        // check if atoms out-of-bounds

-                        if( (atomI < cAmoebaSim.numberOfAtoms) && (atomJ < cAmoebaSim.numberOfAtoms) )
+                        if( (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
                        {
                            // add force and torque to atom I due atom J
    
@@ -372,7 +343,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
 #ifdef AMOEBA_DEBUG
 unsigned int jIdx  = (flags == 0xFFFFFFFF) ? tj : j;
 unsigned int atomJ = y + jIdx;
-unsigned int mask  =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+unsigned int mask  =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
 if( atomI == targetAtom || atomJ == targetAtom ){
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;

@@ -381,29 +352,29 @@ if( atomI == targetAtom || atomJ == targetAtom ){
    debugArray[index].z                = (float) y;
    debugArray[index].w                = (flags == 0xFFFFFFFF) ? (float) -141.0f : -151.0f;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceTorqueEnergy[0].x  : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[0].y  : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[0].z  : 0.0f;
    debugArray[index].w                = mask ? forceTorqueEnergy[0].w  : 0.0f;


-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceTorqueEnergy[1].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[1].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[1].z : 0.0f;
-    float offsetF                      = (float)(3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
+    float offsetF                      = (float)(3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
    debugArray[index].w                = offsetF;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? forceTorqueEnergy[2].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[2].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[2].z : 0.0f;
-    offsetF                            = (float) (3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
+    offsetF                            = (float) (3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms));
    debugArray[index].w                = offsetF;

    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
@@ -419,78 +390,23 @@ if( atomI == targetAtom || atomJ == targetAtom ){
    
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
    
-                float of;
-                unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-                of                                  = outputForce[offset];
-                of                                 += localParticle.force[0];
-                outputForce[offset]                 = of;
-    
-                of                                  = outputForce[offset+1];
-                of                                 += localParticle.force[1];
-                outputForce[offset+1]               = of;
-    
-                of                                  = outputForce[offset+2];
-                of                                 += localParticle.force[2];
-                outputForce[offset+2]               = of;
-    
-                of                                  = outputTorque[offset];
-                of                                 += localParticle.torque[0];
-                outputTorque[offset]                = of;
-    
-                of                                  = outputTorque[offset+1];
-                of                                 += localParticle.torque[1];
-                outputTorque[offset+1]              = of;
+                unsigned int offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
+                add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
+                add3dArray(       3*offset, localParticle.torque, outputTorque );
    
-                of                                  = outputTorque[offset+2];
-                of                                 += localParticle.torque[2];
-                outputTorque[offset+2]              = of;
-    
-                offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
-    
-                of                                  = outputForce[offset];
-                of                                 += sA[threadIdx.x].force[0];
-                outputForce[offset]                 = of;
-    
-                of                                  = outputForce[offset+1];
-                of                                 += sA[threadIdx.x].force[1];
-                outputForce[offset+1]               = of;
-    
-                of                                  = outputForce[offset+2];
-                of                                 += sA[threadIdx.x].force[2];
-                outputForce[offset+2]               = of;
-    
-                of                                  = outputTorque[offset];
-                of                                 += sA[threadIdx.x].torque[0];
-                outputTorque[offset]                = of;
-    
-                of                                  = outputTorque[offset+1];
-                of                                 += sA[threadIdx.x].torque[1];
-                outputTorque[offset+1]              = of;
-    
-                of                                  = outputTorque[offset+2];
-                of                                 += sA[threadIdx.x].torque[2];
-                outputTorque[offset+2]              = of;
+                offset                              = (y + tgx + warp*cSim.paddedNumberOfAtoms);
+                add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
+                add3dArray(       3*offset, sA[threadIdx.x].torque, outputTorque );
    
 #else
-                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-    
-                outputForce[offset]                 = localParticle.force[0];
-                outputForce[offset+1]               = localParticle.force[1];
-                outputForce[offset+2]               = localParticle.force[2];
-    
-                outputTorque[offset]                = localParticle.torque[0];
-                outputTorque[offset+1]              = localParticle.torque[1];
-                outputTorque[offset+2]              = localParticle.torque[2];
-    
-                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset                 = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
+                add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
+                load3dArray(       3*offset, localParticle.torque, outputTorque );
    
-                outputForce[offset]                 = sA[threadIdx.x].force[0];
-                outputForce[offset+1]               = sA[threadIdx.x].force[1];
-                outputForce[offset+2]               = sA[threadIdx.x].force[2];
+                offset                              = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
    
-                outputTorque[offset]                = sA[threadIdx.x].torque[0];
-                outputTorque[offset+1]              = sA[threadIdx.x].torque[1];
-                outputTorque[offset+2]              = sA[threadIdx.x].torque[2];
+                add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
+                load3dArray(       3*offset, sA[threadIdx.x].torque, outputTorque );
    
 #endif
                lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -3,6 +3,7 @@

 //-----------------------------------------------------------------------------------------

+#include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

@@ -135,17 +136,19 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
 static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
 {

+    gpuContext gpu = amoebaGpu->gpuContext;
+
    // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self

-    kReducePmeEFieldPolar_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                                   amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReducePmeEFieldPolar_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                                   gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                                   amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
    LAUNCHERROR("kReducePmeE_Fields1");

    // E_Field = E_Field (reciprocal) + E_Field (direct) + self

-    kReducePmeEField_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                              amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReducePmeEField_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                              gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                              amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
    LAUNCHERROR("kReducePmeE_Fields2");
 }
@@ -362,7 +365,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
 static int isNanOrInfinity( double number ){
    return (number != number || number == std::numeric_limits<double>::infinity() || number == -std::numeric_limits<double>::infinity()) ? 1 : 0; 
 }
-#endif

 static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
 {
@@ -390,7 +392,8 @@ static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int le
    if(test==0) break; /*will exit if the list is sorted!*/
  } /*end for i*/
      
-}/*end bubbleSort*/
+}
+#endif

 /**---------------------------------------------------------------------------------------

@@ -438,7 +441,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    }    

    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -448,7 +451,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif
    } else {
-        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -469,15 +472,15 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
        (void) fflush( amoebaGpu->log );
 /*
-        (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2]  paddedNumberOfAtoms=%d\n",  amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers );
+        (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2]  paddedNumberOfAtoms=%d\n",  gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers );
        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();
-        for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
+        for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;
@@ -624,33 +627,13 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
        fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        kReduceForces( gpu );
+        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
-        //exit(0);
-    }
-
-    if( 0 ){
-        gpuContext gpu                       = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceTorqueFixed", fileId, outputVector );
+        //cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
+        exit(0);
    }

-
-    if( 0 ){
-        cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
-        gpuContext gpu                       = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaDirectEField", fileId, outputVector );
-    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
@@ -101,7 +101,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(

            if( bExclusionFlag ){
                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
            } else {
@@ -127,7 +127,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
                // by setting match flag

-                unsigned int match      = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+                unsigned int match      = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 1 : 0;

                // add to field at atomI the field due atomJ's charge/dipole/quadrupole

@@ -151,37 +151,37 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
    debugArray[index].z                = dScaleValue;
    debugArray[index].w                = pScaleValue;
 /*
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = (float) bExclusionFlag;
    debugArray[index].y                = (float) (tgx);
    debugArray[index].z                = (float) j;
    debugArray[index].w                = flag;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = (float) dScaleMask;
    debugArray[index].y                = (float) pScaleMask.x;
    debugArray[index].z                = (float) pScaleMask.y;
    debugArray[index].w                = flag;
 */
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = match ? 0.0f : ijField[0].x;
    debugArray[index].y                = match ? 0.0f : ijField[1].x;
    debugArray[index].z                = match ? 0.0f : ijField[2].x;
    debugArray[index].w                = flag + 1.0f;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = match ? 0.0f : ijField[0].z;
    debugArray[index].y                = match ? 0.0f : ijField[1].z;
    debugArray[index].z                = match ? 0.0f : ijField[2].z;
    debugArray[index].w                = flag + 2.0f;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = match ? 0.0f : ijField[0].y;
    debugArray[index].y                = match ? 0.0f : ijField[1].y;
    debugArray[index].z                = match ? 0.0f : ijField[2].y;
    debugArray[index].w                = flag + 3.0f;

-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = match ? 0.0f : ijField[0].w;
    debugArray[index].y                = match ? 0.0f : ijField[1].w;
    debugArray[index].z                = match ? 0.0f : ijField[2].w;
@@ -189,7 +189,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){


    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
@@ -204,11 +204,11 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
 #else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,       outputEField );
            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
 #endif
@@ -235,7 +235,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
                if( bExclusionFlag ) {
                    unsigned int xi   = x >> GRIDBITS;
                    unsigned int yi   = y >> GRIDBITS;
-                    unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                    unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                } else {
@@ -258,7 +258,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
    #endif
                        );

-                        unsigned int outOfBounds     = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+                        unsigned int outOfBounds     = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;

                        // add to field at atomI the field due atomJ's charge/dipole/quadrupole

@@ -331,44 +331,44 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){

            float flag                         = 9.0f;
 /*
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = (float) bExclusionFlag;
            debugArray[index].y                = (float) (tgx);
            debugArray[index].z                = (float) j;
            debugArray[index].w                = jIdx;
        
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = (float) dScaleMask;
            debugArray[index].y                = (float) pScaleMask.x;
            debugArray[index].z                = (float) pScaleMask.y;
            debugArray[index].w                = (float) flags;
 */       
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].x;
            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].x;
            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].x;
            debugArray[index].w                =  flag + 1.0f;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].y;
            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].y;
            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].y;
            debugArray[index].w                = flag + 2.0f;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].z;
            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].z;
            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].z;
            debugArray[index].w                = flag + 3.0f;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].w;
            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].w;
            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].w;
            debugArray[index].w                = flag + 4.0f;

            for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-                index                             += cAmoebaSim.paddedNumberOfAtoms;
+                index                             += cSim.paddedNumberOfAtoms;
                debugArray[index].x                = pullBack[pullIndex].x;
                debugArray[index].y                = pullBack[pullIndex].y;
                debugArray[index].z                = pullBack[pullIndex].z;
@@ -384,20 +384,20 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
                // Write results
    
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-                unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
                load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
                load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
    
-                offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
    
 #else
-                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
                load3dArray( offset, fieldSum,       outputEField );
                load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
    
-                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
                load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
                load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
     

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -354,13 +354,14 @@ static void kSorUpdateMutualInducedField_kernel(

 static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReducePmeMI_Fields1");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
    LAUNCHERROR("kReducePmeMI_Fields2");
 }
@@ -410,21 +411,22 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
    }    

-    if (gpu->bOutputBufferPerWarp){
-
 #ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "Cutoff -- use warp\n" );
-            (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                            methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                            sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
-                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-            (void) fflush( amoebaGpu->log );
-        }
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "Cutoff -- use warp\n" );
+        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
+                        methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
+                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+    }
 #endif
+
+    if (gpu->bOutputBufferPerWarp){
+
                                                                 //gpu->sim.pInteractingWorkUnit,
                                                                 //amoebaGpu->psWorkUnit->_pDevData,
-        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -436,17 +438,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte

    } else {

-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "Cutoff no warp\n" );
-            (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                            methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                            sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
-                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-            (void) fflush( amoebaGpu->log );
-        }
-#endif
-        kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -596,7 +588,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
        int offset   = 0;
        int maxPrint = 10;
-         cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii, 
                            amoebaGpu->psPolarizability->_pSysData[offset] );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -110,7 +110,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
 #endif
 );

-                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;

                // add to field at atomI the field due atomJ's dipole

@@ -136,51 +136,51 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            debugArray[index].w                = 6.0f;


-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex].x;
            debugArray[index].y                = pullBack[pullBackIndex].y;
            debugArray[index].z                = pullBack[pullBackIndex].z;
            debugArray[index].w                = pullBack[pullBackIndex].w;

            pullBackIndex++;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex].x;
            debugArray[index].y                = pullBack[pullBackIndex].y;
            debugArray[index].z                = pullBack[pullBackIndex].z;
            debugArray[index].w                = pullBack[pullBackIndex].w;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            float flag                         = 6.0f;
            debugArray[index].x                = ijField[0].x;
            debugArray[index].y                = ijField[1].x;
            debugArray[index].z                = ijField[2].x;
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[0].x;
            debugArray[index].y                = ijField[1].x;
            debugArray[index].z                = ijField[2].x;
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[0].z;
            debugArray[index].y                = ijField[1].z;
            debugArray[index].z                = ijField[2].z;
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[0].z;
            debugArray[index].y                = ijField[1].z;
            debugArray[index].z                = ijField[2].z;
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = match ? 0.0f : ijField[0].x;
            debugArray[index].y                = match ? 0.0f : ijField[1].x;
            debugArray[index].z                = match ? 0.0f : ijField[2].x;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            unsigned int mask                  = 1 << j;
            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
@@ -199,12 +199,12 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset            = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset            = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);

 #else
-            unsigned int offset            = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset            = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,      outputField );
            load3dArray( offset, fieldPolarSum, outputFieldPolar);

@@ -245,7 +245,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
    #endif
           );

-                        unsigned int mask   =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                        unsigned int mask   =  ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;

                        // add to field at atomI the field due atomJ's dipole

@@ -323,39 +323,39 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
            debugArray[index].w                = 7.0f;


-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex].x;
            debugArray[index].y                = pullBack[pullBackIndex].y;
            debugArray[index].z                = pullBack[pullBackIndex].z;
            debugArray[index].w                = pullBack[pullBackIndex].w;

            pullBackIndex++;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex].x;
            debugArray[index].y                = pullBack[pullBackIndex].y;
            debugArray[index].z                = pullBack[pullBackIndex].z;
            debugArray[index].w                = pullBack[pullBackIndex].w;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            float flag                         = 7.0f;
            debugArray[index].x                = ijField[indexI][0];
            debugArray[index].y                = ijField[indexI][1];
            debugArray[index].z                = ijField[indexI][2];
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexJ][0];
            debugArray[index].y                = ijField[indexJ][1];
            debugArray[index].z                = ijField[indexJ][2];
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexI+1][0];
            debugArray[index].y                = ijField[indexI+1][1];
            debugArray[index].z                = ijField[indexI+1][2];
            debugArray[index].w                = flag;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexJ+1][0];
            debugArray[index].y                = ijField[indexJ+1][1];
            debugArray[index].z                = ijField[indexJ+1][2];
@@ -372,21 +372,21 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
                // Write results
    
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-                unsigned int offset     = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset     = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
                load3dArrayBufferPerWarp( offset, fieldSum,      outputField );
                load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
    
-                offset                  = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+                offset                  = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
    
                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field,      outputField );
                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
    
 #else
-                unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                unsigned int offset     = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
                load3dArray( offset, fieldSum,      outputField );
                load3dArray( offset, fieldPolarSum, outputFieldPolar);
    
-                offset                  = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+                offset                  = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
                load3dArray( offset, sA[threadIdx.x].field,      outputField );
                load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
    

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -77,74 +77,58 @@ void kCudaComputeCheckChiral_kernel( void )
    const int C           = 3;
    float delta[4][3];
 
-    float4* atomCoord            = cSim.pPosq;
-    int4* multiPoleAtoms         = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
-    float* molecularDipole       = cAmoebaSim.pMolecularDipole;
-    float* molecularQuadrupole   = cAmoebaSim.pMolecularQuadrupole;
+    float4* particleCoord        = cSim.pPosq;
+    int4* multiPoleParticles     = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
    float* labFrameDipole        = cAmoebaSim.pLabFrameDipole;
    float* labFrameQuadrupole    = cAmoebaSim.pLabFrameQuadrupole;
 
    // ---------------------------------------------------------------------------------------
 
-    int atomIndex                = blockIdx.x;
-    if( atomIndex >= cSim.atoms )return;
- 
-    int axisType                 = multiPoleAtoms[atomIndex].w; 
- 
-    float* molDipole             = &(molecularDipole[atomIndex*3]);
-    float* labDipole             = &(labFrameDipole[atomIndex*3]);
-    labDipole[0]                 = molDipole[0];
-    labDipole[1]                 = molDipole[1];
-    labDipole[2]                 = molDipole[2];
- 
-    float* molQuadrupole         = &(molecularQuadrupole[atomIndex*9]);
-    float* labQuadrupole         = &(labFrameQuadrupole[atomIndex*9]);
-    labQuadrupole[0]             = molQuadrupole[0];
-    labQuadrupole[1]             = molQuadrupole[1];
-    labQuadrupole[2]             = molQuadrupole[2];
-    labQuadrupole[3]             = molQuadrupole[3];
-    labQuadrupole[4]             = molQuadrupole[4];
-    labQuadrupole[5]             = molQuadrupole[5];
-    labQuadrupole[6]             = molQuadrupole[6];
-    labQuadrupole[7]             = molQuadrupole[7];
-    labQuadrupole[8]             = molQuadrupole[8];
-
-    // skip z-then-x
-
-    if( axisType == 0 || multiPoleAtoms[atomIndex].y < 0 )return;
- 
-    // ---------------------------------------------------------------------------------------
- 
-    int atomA                    = atomIndex;
-    int atomB                    = multiPoleAtoms[atomIndex].z;
-    int atomC                    = multiPoleAtoms[atomIndex].x;
-    int atomD                    = multiPoleAtoms[atomIndex].y;
-
-    delta[AD][0]                 = atomCoord[atomA].x - atomCoord[atomD].x;
-    delta[AD][1]                 = atomCoord[atomA].y - atomCoord[atomD].y;
-    delta[AD][2]                 = atomCoord[atomA].z - atomCoord[atomD].z;
-
-    delta[BD][0]                 = atomCoord[atomB].x - atomCoord[atomD].x;
-    delta[BD][1]                 = atomCoord[atomB].y - atomCoord[atomD].y;
-    delta[BD][2]                 = atomCoord[atomB].z - atomCoord[atomD].z;
-
-    delta[CD][0]                 = atomCoord[atomC].x - atomCoord[atomD].x;
-    delta[CD][1]                 = atomCoord[atomC].y - atomCoord[atomD].y;
-    delta[CD][2]                 = atomCoord[atomC].z - atomCoord[atomD].z;
-
-    delta[C][0]                  = delta[BD][1]*delta[CD][2] - delta[BD][2]*delta[CD][1];
-    delta[C][1]                  = delta[CD][1]*delta[AD][2] - delta[CD][2]*delta[AD][1];
-    delta[C][2]                  = delta[AD][1]*delta[BD][2] - delta[AD][2]*delta[BD][1];
- 
-    float volume                 = delta[C][0]*delta[AD][0] + delta[C][1]*delta[BD][0] + delta[C][2]*delta[CD][0];
-    if( volume < 0.0 ){
-        labDipole[1]            *= -1.0f; // pole(3,i)
-        labQuadrupole[1]        *= -1.0f; // pole(6,i)  && pole(8,i)
-        labQuadrupole[3]        *= -1.0f; // pole(10,i) && pole(12,i)
-        labQuadrupole[5]        *= -1.0f; // pole(6,i)  && pole(8,i)
-        labQuadrupole[7]        *= -1.0f; // pole(10,i) && pole(12,i)
-    }
-
+    int particleIndex            = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int numberOfParticles        = cSim.atoms;
+    while( particleIndex < numberOfParticles )
+    { 
+        // skip z-then-x
+    
+        int axisType                     = multiPoleParticles[particleIndex].w; 
+        if( axisType != 0 && multiPoleParticles[particleIndex].y >= 0 )
+        {
+     
+            // ---------------------------------------------------------------------------------------
+         
+            int particleA                = particleIndex;
+            int particleB                = multiPoleParticles[particleIndex].z;
+            int particleC                = multiPoleParticles[particleIndex].x;
+            int particleD                = multiPoleParticles[particleIndex].y;
+        
+            delta[AD][0]                 = particleCoord[particleA].x - particleCoord[particleD].x;
+            delta[AD][1]                 = particleCoord[particleA].y - particleCoord[particleD].y;
+            delta[AD][2]                 = particleCoord[particleA].z - particleCoord[particleD].z;
+        
+            delta[BD][0]                 = particleCoord[particleB].x - particleCoord[particleD].x;
+            delta[BD][1]                 = particleCoord[particleB].y - particleCoord[particleD].y;
+            delta[BD][2]                 = particleCoord[particleB].z - particleCoord[particleD].z;
+        
+            delta[CD][0]                 = particleCoord[particleC].x - particleCoord[particleD].x;
+            delta[CD][1]                 = particleCoord[particleC].y - particleCoord[particleD].y;
+            delta[CD][2]                 = particleCoord[particleC].z - particleCoord[particleD].z;
+        
+            delta[C][0]                  = delta[BD][1]*delta[CD][2] - delta[BD][2]*delta[CD][1];
+            delta[C][1]                  = delta[CD][1]*delta[AD][2] - delta[CD][2]*delta[AD][1];
+            delta[C][2]                  = delta[AD][1]*delta[BD][2] - delta[AD][2]*delta[BD][1];
+         
+            float volume                 = delta[C][0]*delta[AD][0] + delta[C][1]*delta[BD][0] + delta[C][2]*delta[CD][0];
+            if( volume < 0.0 ){
+                labFrameDipole[particleIndex*3+1]            *= -1.0f; // pole(3,i)
+                labFrameQuadrupole[particleIndex*9+1]        *= -1.0f; // pole(6,i)  && pole(8,i)
+                labFrameQuadrupole[particleIndex*9+3]        *= -1.0f; // pole(10,i) && pole(12,i)
+                labFrameQuadrupole[particleIndex*9+5]        *= -1.0f; // pole(6,i)  && pole(8,i)
+                labFrameQuadrupole[particleIndex*9+7]        *= -1.0f; // pole(10,i) && pole(12,i)
+            }
+        }
+    
+        particleIndex                += gridDim.x*blockDim.x;
+    }    
 }

 __global__
@@ -162,18 +146,14 @@ void kCudaComputeLabFrameMoments_kernel( void )
    float vectorY[3];
    float vectorZ[3];
 
-    int numOfAtoms               = cSim.atoms;
-    float4* atomCoord            = cSim.pPosq;
-    int4* multiPoleAtoms         = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
+    int particleIndex            = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+    int numberOfParticles        = cSim.atoms;
+
+    float4* particleCoord        = cSim.pPosq;
+    int4* multiPoleParticles     = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
    float* labFrameDipole        = cAmoebaSim.pLabFrameDipole;
    float* labFrameQuadrupole    = cAmoebaSim.pLabFrameQuadrupole;
 
-    // ---------------------------------------------------------------------------------------
- 
-    int atomIndex = blockIdx.x;
- 
-    // ---------------------------------------------------------------------------------------
- 
    // get coordinates of this atom and the z & x axis atoms
    // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
    // this atom and the axis atom
@@ -182,216 +162,210 @@ void kCudaComputeLabFrameMoments_kernel( void )
 
    // code common to ZThenX and Bisector
    
-    float4 coordinatesThisAtom       = atomCoord[atomIndex];
- 
-    int multipoleAtomIndex           = multiPoleAtoms[atomIndex].z;
-    float4 coordinatesAxisAtom       = atomCoord[multipoleAtomIndex];
- 
-    vectorZ[0]                       = coordinatesAxisAtom.x - coordinatesThisAtom.x;
-    vectorZ[1]                       = coordinatesAxisAtom.y - coordinatesThisAtom.y;
-    vectorZ[2]                       = coordinatesAxisAtom.z - coordinatesThisAtom.z;
-      
-    multipoleAtomIndex               = multiPoleAtoms[atomIndex].x; 
-    coordinatesAxisAtom              = atomCoord[multipoleAtomIndex];
- 
-    vectorX[0]                       = coordinatesAxisAtom.x - coordinatesThisAtom.x;
-    vectorX[1]                       = coordinatesAxisAtom.y - coordinatesThisAtom.y;
-    vectorX[2]                       = coordinatesAxisAtom.z - coordinatesThisAtom.z;
- 
-    int axisType                     = multiPoleAtoms[atomIndex].w; 
-      
+    while( particleIndex < numberOfParticles )
+    {
+        float4 coordinatesThisParticle   = particleCoord[particleIndex];
+     
+        int multipoleParticleIndex       = multiPoleParticles[particleIndex].z;
+        float4 coordinatesAxisParticle   = particleCoord[multipoleParticleIndex];
+     
+        vectorZ[0]                       = coordinatesAxisParticle.x - coordinatesThisParticle.x;
+        vectorZ[1]                       = coordinatesAxisParticle.y - coordinatesThisParticle.y;
+        vectorZ[2]                       = coordinatesAxisParticle.z - coordinatesThisParticle.z;
+          
+        multipoleParticleIndex           = multiPoleParticles[particleIndex].x; 
+        coordinatesAxisParticle          = particleCoord[multipoleParticleIndex];
+     
+        vectorX[0]                       = coordinatesAxisParticle.x - coordinatesThisParticle.x;
+        vectorX[1]                       = coordinatesAxisParticle.y - coordinatesThisParticle.y;
+        vectorX[2]                       = coordinatesAxisParticle.z - coordinatesThisParticle.z;
+     
+        int axisType                     = multiPoleParticles[particleIndex].w; 
+          
+        /*
+            z-only
+               (1) norm z
+               (2) select random x
+               (3) x = x - (x.z)z
+               (4) norm x
+    
+            z-then-x
+               (1) norm z
+               (2) norm x (not needed)
+               (3) x = x - (x.z)z
+               (4) norm x
+    
+            bisector
+               (1) norm z
+               (2) norm x 
+               (3) z = x + z
+               (4) norm z
+               (5) x = x - (x.z)z 
+               (6) norm x 
+    
+            z-bisect
+               (1) norm z
+               (2) norm x 
+               (3) norm y 
+               (3) x = x + y
+               (4) norm x
+               (5) x = x - (x.z)z 
+               (6) norm x 
    
-    /*
-        z-only
-           (1) norm z
-           (2) select random x
-           (3) x = x - (x.z)z
-           (4) norm x
-
-        z-then-x
-           (1) norm z
-           (2) norm x (not needed)
-           (3) x = x - (x.z)z
-           (4) norm x
-
-        bisector
-           (1) norm z
-           (2) norm x 
-           (3) z = x + z
-           (4) norm z
-           (5) x = x - (x.z)z 
-           (6) norm x 
-
-        z-bisect
-           (1) norm z
-           (2) norm x 
-           (3) norm y 
-           (3) x = x + y
-           (4) norm x
-           (5) x = x - (x.z)z 
-           (6) norm x 
-
-        3-fold
-           (1) norm z
-           (2) norm x 
-           (3) norm y 
-           (4) z = x + y + z
-           (5) norm z
-           (6) x = x - (x.z)z 
-           (7) norm x 
-
-    */
-
-    // branch based on axis type
+            3-fold
+               (1) norm z
+               (2) norm x 
+               (3) norm y 
+               (4) z = x + y + z
+               (5) norm z
+               (6) x = x - (x.z)z 
+               (7) norm x 
+    
+        */
+    
+        // branch based on axis type
+         
+        float sum                        = normVector3( vectorZ );
+    
+        if( axisType == 1 ){
+    
+            // bisector
+            
+            sum                     = normVector3( vectorX );
+            
+            vectorZ[0]             += vectorX[0];
+            vectorZ[1]             += vectorX[1];
+            vectorZ[2]             += vectorX[2];
+       
+            sum                     = normVector3( vectorZ );
+    
+        } else if( axisType == 2 || axisType == 3 ){ 
     
-    float sum                        = normVector3( vectorZ );
-
-    if( axisType == 1 ){
-
-        // bisector
+            // z-bisect
+    
+            multipoleParticleIndex  = multiPoleParticles[particleIndex].y; 
+            coordinatesAxisParticle = particleCoord[multipoleParticleIndex];
+            vectorY[0]              = coordinatesAxisParticle.x - coordinatesThisParticle.x;
+            vectorY[1]              = coordinatesAxisParticle.y - coordinatesThisParticle.y;
+            vectorY[2]              = coordinatesAxisParticle.z - coordinatesThisParticle.z;
+    
+            sum                     = normVector3( vectorY );
+            sum                     = normVector3( vectorX );
+    
+            if( axisType == 2 ){
+    
+                vectorX[0]         += vectorY[0];
+                vectorX[1]         += vectorY[1];
+                vectorX[2]         += vectorY[2];
+                sum                 = normVector3( vectorX );
+     
+            } else { 
+     
+                // 3-fold
        
-        sum                     = normVector3( vectorX );
+                vectorZ[0]         += vectorX[0] + vectorY[0];
+                vectorZ[1]         += vectorX[1] + vectorY[1];
+                vectorZ[2]         += vectorX[2] + vectorY[2];
+                sum                 = normVector3( vectorZ );
+            }
+     
+        } else if( axisType >= 4 ){ 
+    
+            vectorX[0]             = 0.1f;
+            vectorX[1]             = 0.1f;
+            vectorX[2]             = 0.1f;
+        }
        
-        vectorZ[0]             += vectorX[0];
-        vectorZ[1]             += vectorX[1];
-        vectorZ[2]             += vectorX[2];
-   
-        sum                     = normVector3( vectorZ );
-
-    } else if( axisType == 2 || axisType == 3 ){ 
- 
-        // z-bisect
-
-        multipoleAtomIndex      = multiPoleAtoms[atomIndex].y; 
-        coordinatesAxisAtom     = atomCoord[multipoleAtomIndex];
-        vectorY[0]              = coordinatesAxisAtom.x - coordinatesThisAtom.x;
-        vectorY[1]              = coordinatesAxisAtom.y - coordinatesThisAtom.y;
-        vectorY[2]              = coordinatesAxisAtom.z - coordinatesThisAtom.z;
-
-        sum                     = normVector3( vectorY );
-        sum                     = normVector3( vectorX );
-
-        if( axisType == 2 ){
-
-            vectorX[0]         += vectorY[0];
-            vectorX[1]         += vectorY[1];
-            vectorX[2]         += vectorY[2];
-            sum                 = normVector3( vectorX );
- 
-        } else { 
- 
-            // 3-fold
+        // x = x - (x.z)z
+    
+        float dot         = vectorZ[0]*vectorX[0] + vectorZ[1]*vectorX[1] + vectorZ[2]*vectorX[2];
+            
+        vectorX[0]       -= dot*vectorZ[0];
+        vectorX[1]       -= dot*vectorZ[1];
+        vectorX[2]       -= dot*vectorZ[2];
+         
+        sum               = normVector3( vectorX );
    
-            vectorZ[0]         += vectorX[0] + vectorY[0];
-            vectorZ[1]         += vectorX[1] + vectorY[1];
-            vectorZ[2]         += vectorX[2] + vectorY[2];
-            sum                 = normVector3( vectorZ );
+        vectorY[0]        = (vectorZ[1]*vectorX[2]) - (vectorZ[2]*vectorX[1]);
+        vectorY[1]        = (vectorZ[2]*vectorX[0]) - (vectorZ[0]*vectorX[2]);
+        vectorY[2]        = (vectorZ[0]*vectorX[1]) - (vectorZ[1]*vectorX[0]);
+     
+        // use identity rotation matrix for unrecognized axis types
+    
+        if( axisType < 0 || axisType > 4 ){
+    
+            vectorX[0] = 1.0f;
+            vectorX[1] = 0.0f;
+            vectorX[2] = 0.0f;
+    
+            vectorY[0] = 0.0f;
+            vectorY[1] = 1.0f;
+            vectorY[2] = 0.0f;
+    
+            vectorZ[0] = 0.0f;
+            vectorZ[1] = 0.0f;
+            vectorZ[2] = 1.0f;
        }
- 
-    } else if( axisType >= 4 ){ 
-
-        vectorX[0]             = 0.1f;
-        vectorX[1]             = 0.1f;
-        vectorX[2]             = 0.1f;
-    }
    
-    // x = x - (x.z)z
-
-    float dot         = vectorZ[0]*vectorX[0] + vectorZ[1]*vectorX[1] + vectorZ[2]*vectorX[2];
+        float molDipole[3];
+        molDipole[0]                            = labFrameDipole[particleIndex*3];
+        molDipole[1]                            = labFrameDipole[particleIndex*3+1];
+        molDipole[2]                            = labFrameDipole[particleIndex*3+2];
        
-    vectorX[0]       -= dot*vectorZ[0];
-    vectorX[1]       -= dot*vectorZ[1];
-    vectorX[2]       -= dot*vectorZ[2];
+        // set out-of-range elements to 0.0f
     
-    sum               = normVector3( vectorX );
+        labFrameDipole[particleIndex*3]         = molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0];
+        labFrameDipole[particleIndex*3+1]       = molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1];
+        labFrameDipole[particleIndex*3+2]       = molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2];
+        
+        // ---------------------------------------------------------------------------------------
+        
+        float mPole[3][3];
+        unsigned int offset           = particleIndex*9;
+        
+        mPole[0][0]                   = labFrameQuadrupole[offset];
+        mPole[0][1]                   = labFrameQuadrupole[offset+1];
+        mPole[0][2]                   = labFrameQuadrupole[offset+2];
+    
+        mPole[1][0]                   = labFrameQuadrupole[offset+3];
+        mPole[1][1]                   = labFrameQuadrupole[offset+4];
+        mPole[1][2]                   = labFrameQuadrupole[offset+5];
+    
+        mPole[2][0]                   = labFrameQuadrupole[offset+6];
+        mPole[2][1]                   = labFrameQuadrupole[offset+7];
+        mPole[2][2]                   = labFrameQuadrupole[offset+8];
+    
+        labFrameQuadrupole[offset+8]  = vectorX[2]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
+        labFrameQuadrupole[offset+8] += vectorY[2]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
+        labFrameQuadrupole[offset+8] += vectorZ[2]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);

-    vectorY[0]        = (vectorZ[1]*vectorX[2]) - (vectorZ[2]*vectorX[1]);
-    vectorY[1]        = (vectorZ[2]*vectorX[0]) - (vectorZ[0]*vectorX[2]);
-    vectorY[2]        = (vectorZ[0]*vectorX[1]) - (vectorZ[1]*vectorX[0]);
- 
-    // use identity rotation matrix for unrecognized axis types
+        labFrameQuadrupole[offset+4]  = vectorX[1]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
+        labFrameQuadrupole[offset+4] += vectorY[1]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
+        labFrameQuadrupole[offset+4] += vectorZ[1]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);

-    if( axisType < 0 || axisType > 4 ){
+        labFrameQuadrupole[offset+5]  = vectorX[1]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
+        labFrameQuadrupole[offset+5] += vectorY[1]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
+        labFrameQuadrupole[offset+5] += vectorZ[1]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);

-        vectorX[0] = 1.0f;
-        vectorX[1] = 0.0f;
-        vectorX[2] = 0.0f;
+        labFrameQuadrupole[offset]    = vectorX[0]*(vectorX[0]*mPole[0][0] + vectorY[0]*mPole[0][1] + vectorZ[0]*mPole[0][2]);
+        labFrameQuadrupole[offset]   += vectorY[0]*(vectorX[0]*mPole[1][0] + vectorY[0]*mPole[1][1] + vectorZ[0]*mPole[1][2]);
+        labFrameQuadrupole[offset]   += vectorZ[0]*(vectorX[0]*mPole[2][0] + vectorY[0]*mPole[2][1] + vectorZ[0]*mPole[2][2]);

-        vectorY[0] = 0.0f;
-        vectorY[1] = 1.0f;
-        vectorY[2] = 0.0f;
+        labFrameQuadrupole[offset+1]  = vectorX[0]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
+        labFrameQuadrupole[offset+1] += vectorY[0]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
+        labFrameQuadrupole[offset+1] += vectorZ[0]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);

-        vectorZ[0] = 0.0f;
-        vectorZ[1] = 0.0f;
-        vectorZ[2] = 1.0f;
-    }
-
-    float molDipole[3];
-    float* labDipole  = &(labFrameDipole[atomIndex*3]);
-    molDipole[0]      = labDipole[0];
-    molDipole[1]      = labDipole[1];
-    molDipole[2]      = labDipole[2];
-    
-    // set out-of-range elements to 0.0f
+        labFrameQuadrupole[offset+2]  = vectorX[0]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
+        labFrameQuadrupole[offset+2] += vectorY[0]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
+        labFrameQuadrupole[offset+2] += vectorZ[0]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
 
-    labDipole[0]      = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0];
-    labDipole[1]      = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1];
-    labDipole[2]      = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2];
-    
-    // ---------------------------------------------------------------------------------------
-    
-    float* rPole[3];
-    float mPole[3][3];
-    float* labQuadrupole       = &(labFrameQuadrupole[atomIndex*9]);
-    
-    for( int ii = 0; ii < 3; ii++ ){
-        mPole[ii][0]   = labQuadrupole[3*ii+0];
-        mPole[ii][1]   = labQuadrupole[3*ii+1];
-        mPole[ii][2]   = labQuadrupole[3*ii+2];
+        labFrameQuadrupole[offset+3]  = labFrameQuadrupole[offset+1];
+        labFrameQuadrupole[offset+6]  = labFrameQuadrupole[offset+2];
+        labFrameQuadrupole[offset+7]  = labFrameQuadrupole[offset+5];

-        rPole[ii]      = labQuadrupole + ii*3;
-        rPole[ii][0]   = 0.0f;
-        rPole[ii][1]   = 0.0f;
-        rPole[ii][2]   = 0.0f;
-
-    }
-    
-    int ii = threadIdx.x;
-    if( ii < 3 ){
-        for( int jj = ii; jj < 3; jj++ ){
- 
-            rPole[ii][jj] += vectorX[ii]*vectorX[jj]*mPole[0][0];
-            rPole[ii][jj] += vectorX[ii]*vectorY[jj]*mPole[0][1];
-            rPole[ii][jj] += vectorX[ii]*vectorZ[jj]*mPole[0][2];
-       	
-            rPole[ii][jj] += vectorY[ii]*vectorX[jj]*mPole[1][0];
-            rPole[ii][jj] += vectorY[ii]*vectorY[jj]*mPole[1][1];
-            rPole[ii][jj] += vectorY[ii]*vectorZ[jj]*mPole[1][2];
-       	
-            rPole[ii][jj] += vectorZ[ii]*vectorX[jj]*mPole[2][0];
-            rPole[ii][jj] += vectorZ[ii]*vectorY[jj]*mPole[2][1];
-            rPole[ii][jj] += vectorZ[ii]*vectorZ[jj]*mPole[2][2];
-       }
+        particleIndex                += gridDim.x*blockDim.x;
    }
- 
-    __syncthreads();
- 
- 
-    rPole[1][0] = rPole[0][1];
-    rPole[2][0] = rPole[0][2];
-    rPole[2][1] = rPole[1][2];
- 
-    // set out-of-range elements to 0.0f
- 
-    labQuadrupole[0]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[0];
-    labQuadrupole[1]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[1];
-    labQuadrupole[2]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[2];
-    labQuadrupole[3]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[3];
-    labQuadrupole[4]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[4];
-    labQuadrupole[5]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[5];
-    labQuadrupole[6]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[6];
-    labQuadrupole[7]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[7];
-    labQuadrupole[8]   = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[8];
+     
 }

 void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
@@ -405,8 +379,8 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )

    gpuContext gpu    = amoebaGpu->gpuContext;

-    int numBlocks     =  amoebaGpu->paddedNumberOfAtoms;
-    int numThreads    =  20;
+    int numBlocks     = gpu->sim.blocks;
+    int numThreads    = gpu->sim.update_threads_per_block;

 //#define AMOEBA_DEBUG  
 #ifdef AMOEBA_DEBUG
@@ -434,100 +408,19 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
    double kernelTime = 0.0;
 #endif

+    // copy molecular moments to lab frame moment arrays
+    // check if chiral center requires moments to have sign flipped
+    // compute lab frame moments
+
+    cudaMemcpy( amoebaGpu->psLabFrameDipole->_pDevData,      amoebaGpu->psMolecularDipole->_pDevData,    3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ); 
+    cudaMemcpy( amoebaGpu->psLabFrameQuadrupole->_pDevData, amoebaGpu->psMolecularQuadrupole->_pDevData, 9*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ); 
+
    kCudaComputeCheckChiral_kernel<<< numBlocks, numThreads>>> ( );
    LAUNCHERROR("kCudaComputeCheckChiral");

    kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( );
    LAUNCHERROR(methodName);

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        static int timestep = 0;
-        timestep++;
-        (void) fprintf( amoebaGpu->log, "Finished rotation kernel execution in %lf us\n", kernelTime ); (void) fflush( amoebaGpu->log );
-        (void) fflush( amoebaGpu->log );
-
-        amoebaGpu->psLabFrameDipole->Download();
-        (void) fprintf( amoebaGpu->log, "psLabFrameDipole completed\n" );  (void) fflush( amoebaGpu->log );
-
-        amoebaGpu->psLabFrameQuadrupole->Download();
-        (void) fprintf( amoebaGpu->log, "psLabFrameQpole completed\n" );  (void) fflush( amoebaGpu->log );
-
-        int maxPrint = 20;
-        for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
-
-             int dipoleOffset     = 3*ii;
-             int quadrupoleOffset = 9*ii;
-
-             (void) fprintf( amoebaGpu->log,"\n%6d [%6d %6d %6d] ", ii,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w );
-             // coords
-
-             (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e]\n",
-                             gpu->psPosq4->_pSysData[ii].x,
-                             gpu->psPosq4->_pSysData[ii].y,
-                             gpu->psPosq4->_pSysData[ii].z);
-/*
-             (void) fprintf( amoebaGpu->log,"   R[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+1],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+2],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+3],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+4],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+5],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+6],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+7],
-                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+8] );
-*/
-             // dipole
-
-             (void) fprintf( amoebaGpu->log,"   D[%16.9e %16.9e %16.9e]\n",
-                             amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset],
-                             amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+1],
-                             amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+2] );
-    
-             // quadrupole
-
-             (void) fprintf( amoebaGpu->log,"   Q[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+1],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+2],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+3],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+4],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+5],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+6],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+7],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+8] );
-
-            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint)) ){
-                ii = gpu->natoms - maxPrint;
-            }
-        }
-        int nansDetected   = checkForNansAndInfinities( amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->psLabFrameDipole );
-            nansDetected  += checkForNansAndInfinities( amoebaGpu->paddedNumberOfAtoms*9, amoebaGpu->psLabFrameQuadrupole );
-        if( nansDetected ){
-             (void) fprintf( amoebaGpu->log,"Nans detected in dipole/quadrupoles.\n" );
-             exit(0);
-        }
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
-    if( 0 ){
-
-        int particles = amoebaGpu->paddedNumberOfAtoms;
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4,                     outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( particles,  3, amoebaGpu->psLabFrameDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( particles,  9, amoebaGpu->psLabFrameQuadrupole,  outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaRotatedMoments", fileId, outputVector );
-    }
-  
 }

 void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
@@ -536,21 +429,9 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG

    // compute lab frame moments

-    if( 0 ){
-        static int iteration = 0;
-        gpuContext gpu       = amoebaGpu->gpuContext;
-        checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "MultipoleForcesPreLabCoord", stderr );
-    }   
-
-    if( 0 ){
-        static int iteration = 0;
-        gpuContext gpu       = amoebaGpu->gpuContext;
-        checkForNansFloat4( gpu->natoms, gpu->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "MultipoleForcesPreForce", stderr );
-     }   
-
    cudaComputeAmoebaLabFrameMoments( amoebaGpu );

-    if( 0 ){
+    if( 1 ){
        gpuContext gpu                       = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        //fileId.push_back( 0 );
@@ -567,10 +448,14 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
        cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
        cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
    } else {
+
        if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
+
            cudaComputeAmoebaFixedEField( amoebaGpu );
            cudaComputeAmoebaMutualInducedField( amoebaGpu );
+
        } else {
+
            gpuContext gpu = amoebaGpu->gpuContext;
            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
            LAUNCHERROR("kFindBlockBoundsPeriodic");
@@ -581,6 +466,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+
            cudaComputeAmoebaPmeFixedEField( amoebaGpu );
            cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
        }
@@ -597,33 +483,10 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    // calculate electrostatic forces

    if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
-
        cudaComputeAmoebaElectrostatic( amoebaGpu );
-
-        // map torques to forces
-
-        cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, amoebaGpu->gpuContext->psForce4 );
-
-        if( 0 ){
-            gpuContext gpu = amoebaGpu->gpuContext;
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaLoadCudaFloat4Array( gpu->natoms,  3, amoebaGpu->gpuContext->psForce4,     outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184 );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaMpole", fileId, outputVector );
-        }   
-
    } else {
        cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
    }
-
-    if( 0 ){
-        static int iteration = 0;
-        gpuContext gpu       = amoebaGpu->gpuContext;
-        checkForNansFloat4( gpu->natoms, gpu->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "MultipoleForcesPstForce", stderr );
-     }   
-
 }

 #undef AMOEBA_DEBUG
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
@@ -117,23 +117,27 @@ void kClearFields_kernel( unsigned int bufferLength, float* EField )
 void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
 {

-    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_3_1->_pDevData );
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevData );
    LAUNCHERROR("kClearFields_3_1");

-    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_3_2->_pDevData );
-    LAUNCHERROR("kClearFields_3_2");
+    if( numberToClear > 1 ){
+        kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_2->_pDevData );
+        LAUNCHERROR("kClearFields_3_2");
+    } else {
+        return;
+    }

    if( numberToClear > 2 ){
-        kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
+        kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers,
                                                            amoebaGpu->psWorkArray_3_3->_pDevData );
        LAUNCHERROR("kClearFields_3_3");
    } else {
        return;
    }
+
    if( numberToClear > 3 ){
-        kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
+        kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers,
                                                                amoebaGpu->psWorkArray_3_4->_pDevData );
        LAUNCHERROR("kClearFields_3_4");
    }
@@ -144,11 +148,12 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
 void kClearFields_1( amoebaGpuContext amoebaGpu )
 {

-    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*gpu->sim.outputBuffers,
                                                            amoebaGpu->psWorkArray_1_1->_pDevData );
    LAUNCHERROR("kClearFields_1_1");

-    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
+    kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*gpu->sim.outputBuffers,
                                                            amoebaGpu->psWorkArray_1_2->_pDevData );
    LAUNCHERROR("kClearFields_1_2");
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -449,8 +449,9 @@ static void kCalculateAmoebaVdw14_7NonReduction(amoebaGpuContext amoebaGpu, CUDA

 static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                           gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                           amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceVdw14_7");
 }
@@ -545,13 +546,13 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
    if( 1 && amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
        (void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        if( 0 ){  
            gpu->psInteractionCount->Download();
            amoebaGpu->psVdwWorkUnit->Download();
-            unsigned int totalWarps  = (amoebaGpu->nonbondBlocks*threadsPerBlock)/GRID;
+            unsigned int totalWarps  = (gpu->sim.nonbond_blocks*threadsPerBlock)/GRID;
            float        ratiof     = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
            (void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
            for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
@@ -610,7 +611,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
 #endif

        if (gpu->bOutputBufferPerWarp){
-            kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+            kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                gpu->sim.pInteractingWorkUnit,
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
@@ -624,7 +625,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
 #endif
        } else {

-            kCalculateAmoebaVdw14_7Cutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+            kCalculateAmoebaVdw14_7Cutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                gpu->sim.pInteractingWorkUnit,
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
@@ -644,7 +645,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff

        if (gpu->bOutputBufferPerWarp){

-            kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+            kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                amoebaGpu->psVdwWorkUnit->_pDevData,
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
@@ -658,7 +659,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
 #endif
        } else {

-            kCalculateAmoebaVdw14_7N2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+            kCalculateAmoebaVdw14_7N2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                amoebaGpu->psVdwWorkUnit->_pDevData,
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
@@ -707,7 +708,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
        amoebaGpu->psWorkArray_3_1->Download();
        //for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
        for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
-            for( int kk = 0; kk < amoebaGpu->outputBuffers; kk++ ){
+            for( int kk = 0; kk < gpu->sim.outputBuffers; kk++ ){
                float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
                if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] )  )
                (void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
@@ -95,7 +95,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(

            if( bExclusionFlag ){
                unsigned int xi              = x >> GRIDBITS;
-                unsigned int cell            = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int cell            = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                exclusionIndex               = cAmoebaSim.pVdwExclusionIndicesIndex[cell]+tgx;
                exclusionMask                = cAmoebaSim.pVdwExclusionIndices[exclusionIndex];
            }
@@ -136,7 +136,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
 );
                // mask out excluded ixns

-                unsigned int mask  =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask  =  ( (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
                if( mask && bExclusionFlag ){
                    unsigned int maskIndex  = 1 << j;
                    mask = (exclusionMask & maskIndex) ? 0 : 1;
@@ -158,25 +158,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
        debugArray[index].z                = -1.0f;
        debugArray[index].w                = (float) (mask + 1); 

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) x;
        debugArray[index].y                = (float) y;
        debugArray[index].z                = (float) tgx;
        debugArray[index].w                = energy;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullDebug[0].x;
        debugArray[index].y                = pullDebug[0].y;
        debugArray[index].z                = pullDebug[0].z;
        debugArray[index].w                = pullDebug[0].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullDebug[1].x;
        debugArray[index].y                = pullDebug[1].y;
        debugArray[index].z                = pullDebug[1].z;
        debugArray[index].w                = pullDebug[1].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? ijForce[0] : 0.0f;
        debugArray[index].y                = mask ? ijForce[1] : 0.0f;
        debugArray[index].z                = mask ? ijForce[2] : 0.0f;
@@ -189,11 +189,11 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

 #ifdef USE_OUTPUT_BUFFER_PER_WARP

-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, forceSum, outputForce );

 #else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, forceSum, outputForce );
 #endif

@@ -268,7 +268,7 @@ flags = 0xFFFFFFFF;
            
                        // mask out excluded ixns
        
-                        unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                        unsigned int mask       =  ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;
                        if( mask && bExclusionFlag ){
                            unsigned int maskIndex  = 1 << jIdx;
                            mask =  (exclusionMask & maskIndex) ? 0 : 1;
@@ -337,25 +337,25 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
        debugArray[index].z                = -3.0;
        debugArray[index].w                = (float) (mask + 1); 

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) x;
        debugArray[index].y                = (float) y;
        debugArray[index].z                = (float) tgx;
        debugArray[index].w                = energy;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullDebug[0].x;
        debugArray[index].y                = pullDebug[0].y;
        debugArray[index].z                = pullDebug[0].z;
        debugArray[index].w                = pullDebug[0].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullDebug[1].x;
        debugArray[index].y                = pullDebug[1].y;
        debugArray[index].z                = pullDebug[1].z;
        debugArray[index].w                = pullDebug[1].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = mask ? ijForce[0] : 0.0f;
        debugArray[index].y                = mask ? ijForce[1] : 0.0f;
        debugArray[index].z                = mask ? ijForce[2] : 0.0f;
@@ -375,17 +375,17 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, forceSum,       outputForce );

-            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);

            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,       outputForce );
 #else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, forceSum,       outputForce );

-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, sA[threadIdx.x].force,       outputForce );

 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
@@ -357,8 +357,9 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f

 static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceWcaDispersion");
 }
@@ -367,8 +368,9 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*

 static void kReduceWcaDispersionToFloat4(amoebaGpuContext amoebaGpu, CUDAStream<float4>* outputArray )
 {
-    kReduceFieldsToFloat4_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                                   amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFieldsToFloat4_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                                   gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                                   amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceWcaDispersion");
 }
@@ -417,7 +419,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(WcaDispersionParticle), sizeof(WcaDispersionParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
@@ -425,7 +427,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
 #endif

    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
                                                            amoebaGpu->psWorkUnit->_pDevData,
                                                            gpu->psPosq4->_pDevData,
                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
@@ -438,7 +440,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )

    } else {

-        kCalculateAmoebaWcaDispersionN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaWcaDispersionN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
                                                            amoebaGpu->psWorkUnit->_pDevData,
                                                            gpu->psPosq4->_pDevData,
                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
@@ -137,43 +137,43 @@ unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;

 debugArray[index].x                = (float) atomI;
 debugArray[index].y                = (float) (y + tj); 
-//debugArray[index].z                = (float) cAmoebaSim.numberOfAtoms;
+//debugArray[index].z                = (float) cSim.atoms;
 debugArray[index].z                = atomI == (y+tj) ? 0.0f : energy;
-energy                             = ( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) )  ? (energy) : 0.0f;
+energy                             = ( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) )  ? (energy) : 0.0f;
 debugArray[index].w                = energy+totalEnergy;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = iCoord.x;
 debugArray[index].y                = iCoord.y;
 debugArray[index].z                = iCoord.z;
 debugArray[index].w                = (float) (blockIdx.x * blockDim.x + threadIdx.x);

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = jCoord.x;
 debugArray[index].y                = jCoord.y;
 debugArray[index].z                = jCoord.z;
 debugArray[index].w                = -4.0f;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = emixo;
 debugArray[index].y                = emixh;
 debugArray[index].z                = rmixo;
 debugArray[index].w                = rmixh;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = pullDebug[0].x;
 debugArray[index].y                = pullDebug[0].y;
 debugArray[index].z                = pullDebug[0].z;
 debugArray[index].w                = pullDebug[0].w;

 #if 0
-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = pullDebug[1].x;
 debugArray[index].y                = pullDebug[1].y;
 debugArray[index].z                = pullDebug[1].z;
 debugArray[index].w                = pullDebug[1].w;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = pullDebug[2].x;
 debugArray[index].y                = pullDebug[2].y;
 debugArray[index].z                = pullDebug[2].z;
@@ -184,7 +184,7 @@ debugArray[index].w                = pullDebug[2].w;
 //    energy = 0.0f;
 }
 #endif
-            if( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
+            if( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
       
                // add to field at atomI the field due atomJ's dipole

@@ -219,34 +219,34 @@ debugArray[index].w                = pullDebug[2].w;
 if( (atomI == targetAtom) || ( (y+tj) == targetAtom ) ){

 unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-index                             += 2*cAmoebaSim.paddedNumberOfAtoms;
+index                             += 2*cSim.paddedNumberOfAtoms;

 debugArray[index].x                = (float) atomI;
 debugArray[index].y                = (float) (y + tj); 
 debugArray[index].z                = atomI == (y+tj) ? 0.0f : energy;
-energy                             = ( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) )  ? (energy) : 0.0f;
+energy                             = ( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) )  ? (energy) : 0.0f;
 debugArray[index].w                = energy+totalEnergy;
 //debugArray[index].w                = -2.0f;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = emjxo;
 debugArray[index].y                = emjxh;
 debugArray[index].z                = rmjxo;
 debugArray[index].w                = rmjxh;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = pullDebug[0].x;
 debugArray[index].y                = pullDebug[0].y;
 debugArray[index].z                = pullDebug[0].z;
 debugArray[index].w                = pullDebug[0].w;
 #if 0
-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = pullDebug[1].x;
 debugArray[index].y                = pullDebug[1].y;
 debugArray[index].z                = pullDebug[1].z;
 debugArray[index].w                = pullDebug[1].w;

-index                             += cAmoebaSim.paddedNumberOfAtoms;
+index                             += cSim.paddedNumberOfAtoms;
 debugArray[index].x                = pullDebug[2].x;
 debugArray[index].y                = pullDebug[2].y;
 debugArray[index].z                = pullDebug[2].z;
@@ -257,7 +257,7 @@ debugArray[index].w                = pullDebug[2].w;
    //energy = 0.0f;
 }
 #endif
-            if( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
+            if( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
       
                // add to field at atomI the field due atomJ's dipole

@@ -281,18 +281,18 @@ debugArray[index].w                = pullDebug[2].w;
        // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-        unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+        unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
        load3dArrayBufferPerWarp( offset, forceSum,       outputForce );

-        offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+        offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);

        load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,       outputForce );

 #else
-        unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+        unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
        load3dArray( offset, forceSum,       outputForce );

-        offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+        offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
        load3dArray( offset, sA[threadIdx.x].force,       outputForce );

 #endif