Removal of limitation for 'long-range in sequence' covalent bonds

Reduced memory footprint

Removal of limitation for 'long-range in sequence' covalent bonds
Reduced memory footprint
761d7e17 · Mark Friedrichs · 80c4976e · 761d7e17 · 761d7e17 · 761d7e17
Commit 761d7e17 authored Apr 07, 2011 by Mark Friedrichs
20 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -112,8 +112,15 @@ void AmoebaCudaData::initializeGpu( void ) {
        if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){
            throw OpenMMException("GK force requires Multipole force\n");
        }
+
+        amoebaGpuBuildOutputBuffers( amoebaGpu, getHasAmoebaGeneralizedKirkwood() );
+        amoebaGpuBuildThreadBlockWorkList( amoebaGpu );
+        amoebaGpuBuildVdwExclusionList( amoebaGpu );
+        amoebaGpuBuildScalingList( amoebaGpu );
        amoebaGpuSetConstants( amoebaGpu );
+
        gpuInitialized = true;
+
        if( log ){
            gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() );
            (void) fprintf( log, "Gpu initialized\n" );

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -93,7 +93,7 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if( mapIterator == contextToAmoebaDataMap.end() ){
        amoebaCudaData                         = new AmoebaCudaData( cudaPlatformData );
        contextToAmoebaDataMap[&context]       = amoebaCudaData;
-        //amoebaCudaData->setLog( stderr );
+        amoebaCudaData->setLog( stderr );
        amoebaCudaData->setContextImpl( static_cast<void*>(&context) );
    } else {
        amoebaCudaData                         = mapIterator->second;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -99,7 +99,7 @@ extern void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext a
 extern void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext gpu);

 extern void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu );
-extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, char* fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1, 
+extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1, 
                                                  int entriesPerAtom2, CUDAStream<float>* array2 );

 extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
@@ -112,9 +112,7 @@ extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu );

 extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaMapTorques( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce);
-extern void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce, CUDAStream<float4>* psOutputForce);
-extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
+extern void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque );

 extern void SetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
@@ -143,14 +141,14 @@ extern void  cudaReduceN2ToN( float *N2Array, int N, float *NArray, int includeD
 extern float cudaGetSum( int numberOfElements, CUDAStream<float>* array );
 extern float cudaGetNorm2( int numberOfElements, CUDAStream<float>* array );
 extern int   checkForNansAndInfinities( int numberOfElements, CUDAStream<float>* array );
-extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, char* fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1, 
+extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1, 
                                                  int entriesPerAtom2, CUDAStream<float>* array2 );
 extern void readFile( std::string fileName, StringVectorVector& fileContents );
 
 extern void cudaLoadCudaFloatArray(  int numberOfParticles, int entriesPerParticle, CUDAStream<float>*  array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
-extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, float conversion );
+extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
 extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
-extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
+extern void cudaWriteVectorOfDoubleVectorsToFile( const std::string& fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
 extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue );
 extern void checkForNans( int numberOfParticles, int entriesPerParticle,
                          CUDAStream<float>* array, int* order, int iteration, std::string idString, FILE* log );
@@ -166,6 +164,8 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int

 //extern int isNanOrInfinity( double number );
 extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
+extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
+extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>*  outputStream, float conversion );

 // PME


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -50,6 +50,7 @@ enum CudaAmoebaNonbondedMethod
 static const int AMOEBA_PME_ORDER = 5;

 struct cudaAmoebaGmxSimulation {
+
    // Constants

    unsigned int    amoebaBonds;                    // Number of bonds
@@ -132,19 +133,20 @@ struct cudaAmoebaGmxSimulation {
    float           amoebaUreyBradleyQuarticicParameter; // quartic parameter
    unsigned int    amoebaUreyBradley_offset;       // Offset to end of bonds

-    unsigned int numberOfAtoms;                     // number of atoms
-    unsigned int paddedNumberOfAtoms;               // padded number of atoms
-    //float cutoffDistance2;                          // cutoff distance squared for PME
    float sqrtPi;                                   // sqrt(PI)
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors

    int4*  pMultipoleParticlesIdsAndAxisType; 
-    int*   pMultipoleAxisOffset; 
+    int4*  pMultipoleParticlesTorqueBufferIndices; 
+    float4* pTorqueMapForce4;
+
    float* pMolecularDipole; 
    float* pMolecularQuadrupole; 
+
    float* pLabFrameDipole;
    float* pLabFrameQuadrupole;
+
    float* pInducedDipole;
    float* pInducedDipolePolar;

@@ -171,6 +173,7 @@ struct cudaAmoebaGmxSimulation {
    int* pVdwExclusionIndices;

    // WCA constants
+
    float epso;
    float epsh;
    float rmino;
@@ -180,6 +183,7 @@ struct cudaAmoebaGmxSimulation {
    float dispoff;
 
    float totalMaxWcaDispersionEnergy;
+
                    // scaling indices
    int*            pScaleIndicesIndex;
    int*            pD_ScaleIndices;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -33,33 +33,31 @@
 #define THREADS_PER_BLOCK 256

 #include <map>
-//using namespace std;
 typedef std::map<int,float> MapIntFloat;
-//typedef MapIntFloat::iterator MapIntFloatI;
 typedef MapIntFloat::const_iterator MapIntFloatCI;


-/* Pointer to this structure will be given 
- * to gromacs functions*/
+/* 
+ * Remove
+ * pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
+ * allocation of torqueMapForce psCovalentDegree psPolarizationDegree
+ * 
+   THREADS_PER_BLOCK
+ */
 struct _amoebaGpuContext {
    
    _gpuContext* gpuContext;
 
    FILE* log;

-    // diagnostic arrays
-
-    MapIntFloat** pMapArray;
-    MapIntFloat** dMapArray;
-
-    bool bOutputBufferPerWarp;
-    unsigned int paddedNumberOfAtoms;
-    unsigned int nonbondBlocks;
-    unsigned int nonbondThreadsPerBlock;
-    unsigned int nonbondOutputBuffers;
-    unsigned int threadsPerBlock;
-    unsigned int fieldReduceThreadsPerBlock;
-    unsigned int outputBuffers; 
+    //bool bOutputBufferPerWarp;
+    //unsigned int paddedNumberOfAtoms;
+    //unsigned int nonbondBlocks;
+    //unsigned int nonbondThreadsPerBlock;
+    //unsigned int nonbondOutputBuffers;
+    //unsigned int threadsPerBlock;
+    //unsigned int fieldReduceThreadsPerBlock;
+    //unsigned int outputBuffers; 
    unsigned int workUnits; 

    // workspace arrays
@@ -120,14 +118,19 @@ struct _amoebaGpuContext {

    float solventDielectric;

-    // rotation matrix
-
-    CUDAStream<float>* psRotationMatrix;
-
    // multipole parameters

    CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
    CUDAStream<int>* psMultipoleAxisOffset;
+
+    // buffer indices used for mapping torques onto forces 
+
+    int maxTorqueBufferIndex;
+    int useNewTorqueMapScheme;
+    int torqueMapForce4Delete;
+    CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
+    CUDAStream<float4>*  psTorqueMapForce4; 
+
    CUDAStream<float>* psMolecularDipole;
    CUDAStream<float>* psMolecularQuadrupole;

@@ -175,11 +178,7 @@ struct _amoebaGpuContext {

    // electrostatic

-    CUDAStream<float>*  psForce; 
    CUDAStream<float>*  psTorque; 
-    CUDAStream<float>*  torqueMapForce; 
-    int maxMapTorqueDifference; 
-    int maxMapTorqueDifferencePow2;

    // Kirkwood fields

@@ -188,8 +187,6 @@ struct _amoebaGpuContext {
    CUDAStream<float>*  psInducedDipolePolarS; 
    CUDAStream<float>*  psBorn; 
    CUDAStream<float>*  psBornPolar; 
-    CUDAStream<float>*  psKirkwoodForce; 
-    CUDAStream<float>*  psKirkwoodEDiffForce; 

    int includeObcCavityTerm;

@@ -208,6 +205,7 @@ struct _amoebaGpuContext {

    int vdwSigmaCombiningRule;
    int vdwEpsilonCombiningRule;
+    std::vector< std::vector<int> > vdwExclusions;

    // Wca dispersion fields

@@ -239,7 +237,7 @@ extern "C"
 void amoebaGpuShutDown(amoebaGpuContext gpu);

 extern "C"
-void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu );
+void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu, int hasKirkwood );

 extern "C"
 int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext gpu );
@@ -327,7 +325,7 @@ extern "C"
 void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);

 extern "C"
-void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu,  const std::vector< std::vector<int> >& exclusions );
+void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu );

 extern "C"
 void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaScaleFactors.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaScaleFactors.h
@@ -101,6 +101,30 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for

 }

+__device__ static void add3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
+{
+
+    float4 of; 
+    of                                  = outputForce[offset];
+    of.x                               += forceSum[0];
+    of.y                               += forceSum[1];
+    of.z                               += forceSum[2];
+    outputForce[offset]                 = of;  
+
+}
+
+__device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
+{
+
+    float4 of; 
+    of.x                                = forceSum[0];
+    of.y                                = forceSum[1];
+    of.z                                = forceSum[2];
+    of.w                                = 0.0f;
+    outputForce[offset]                 = of;  
+
+}
+
 __device__ static void load3dArray( unsigned int offset, float* forceSum, float* outputForce )
 {

@@ -110,6 +134,15 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float*

 }

+__device__ static void add3dArray( unsigned int offset, float* forceSum, float* outputForce )
+{
+
+    outputForce[offset]                += forceSum[0];  
+    outputForce[offset+1]              += forceSum[1];  
+    outputForce[offset+2]              += forceSum[2];  
+
+}
+
 __device__ static void scale3dArray( float scaleFactor, float* force )
 {


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -698,22 +698,17 @@ __device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsig
 #define METHOD_NAME(a, b) a##N2ByWarp##b
 #include "kCalculateAmoebaCudaElectrostatic.h"

-// reduce psWorkArray_3_1 -> force
-// reduce psWorkArray_3_2 -> torque
+// reduce psWorkArray_3_1 -> torque

-static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
+static void kReduceTorque(amoebaGpuContext amoebaGpu )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
-    LAUNCHERROR("kReduceElectrostaticForce");
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
+                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
    LAUNCHERROR("kReduceElectrostaticTorque");
 }

-
 /**---------------------------------------------------------------------------------------

   Compute Amoeba electrostatic force & torque
@@ -728,7 +723,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
  
   // ---------------------------------------------------------------------------------------

-    static unsigned int threadsPerBlock = 0;

 #ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaElectrostatic";
@@ -752,7 +746,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
    }   
    static const int maxSlots                 =20;
-    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    int paddedNumberOfAtoms                   = gpu->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
    debugArray->Upload();
@@ -761,6 +755,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )

    // on first pass, set threads/block

+    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
@@ -772,70 +767,59 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
    }

-    kClearFields_3( amoebaGpu, 2 );
+    kClearFields_3( amoebaGpu, 1 );
    LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");

-    if (gpu->bOutputBufferPerWarp){
-
 #ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                            amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                            gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                            sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
        }
 #endif

+    if (gpu->bOutputBufferPerWarp){

-        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
                                                                           amoebaGpu->psInducedDipole->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif

    } else {

-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u xnCt=%u workUnits=%u\n",
-                            amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                            sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-            (void) fflush( amoebaGpu->log );
-        }
-#endif
-
-        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
                                                                           amoebaGpu->psInducedDipole->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
    }
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

-    kReduceForceTorque( amoebaGpu );
+    kReduceTorque( amoebaGpu );
    LAUNCHERROR("kReduceForceTorque");

+    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
+
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

-        amoebaGpu->psForce->Download();
        amoebaGpu->psTorque->Download();
        debugArray->Download();

@@ -847,13 +831,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )

            int indexOffset     = ii*3;
    
-           // force
-
-           (void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psForce->_pSysData[indexOffset],
-                           amoebaGpu->psForce->_pSysData[indexOffset+1],
-                           amoebaGpu->psForce->_pSysData[indexOffset+2] );
-    
           // torque

           (void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
@@ -925,10 +902,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
    
                int offset  = 3*ii;
    
-                (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
-                                amoebaGpu->psForce->_pSysData[offset],
-                                amoebaGpu->psForce->_pSysData[offset+1],
-                                amoebaGpu->psForce->_pSysData[offset+2],
+                (void) fprintf( amoebaGpu->log,"%6d T[%16.7e %16.7e %16.7e]\n", ii,
                                amoebaGpu->psTorque->_pSysData[offset],
                                amoebaGpu->psTorque->_pSysData[offset+1],
                                amoebaGpu->psTorque->_pSysData[offset+2] );
@@ -941,9 +915,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
+            cudaWriteVectorOfDoubleVectorsToFile( "CudaTorque", fileId, outputVector );
         }

    }   
@@ -956,7 +929,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f/4.184 );
+        //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f/4.184 );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f/4.184 );
        cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
     }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
@@ -6,7 +6,7 @@
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-//#define AMOEBA_DEBUG
+#define AMOEBA_DEBUG

 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
@@ -30,7 +30,6 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         
    RTERROR(status, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }
-
 // reduce psWorkArray_3_1 -> E_Field
 // reduce psWorkArray_3_2 -> E_FieldPolar
 // reduce psWorkArray_3_3 -> Gk_FieldPolar
@@ -38,18 +37,19 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
 static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
 {

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );

    LAUNCHERROR("kReduceEAndGK_Fields1");
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
    LAUNCHERROR("kReduceEAndGK_Fields2");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
    LAUNCHERROR("kReduceEAndGK_Fields3");
 }
@@ -330,8 +330,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )

   // ---------------------------------------------------------------------------------------

-    static unsigned int threadsPerBlock        = 0;
-
    gpuContext gpu                             = amoebaGpu->gpuContext;

 #ifdef AMOEBA_DEBUG
@@ -342,13 +340,11 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )

    // N2 debug array
 
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    int maxSlots                               = 10;
+    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
    debugArray->Upload();

-    (*gpu->psInteractionCount)[0]              = gpu->sim.workUnits;
-    gpu->psInteractionCount->Upload();
-
    // print intermediate results for the targetAtom 

    unsigned int targetAtom  = 0;
@@ -356,6 +352,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )

    // on first pass, set threads/block

+    static unsigned int threadsPerBlock        = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
@@ -372,21 +369,16 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
    }
 #endif

    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
-                                                                           gpu->psPosq4->_pDevData,
-                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
-                                                                           gpu->psBornRadii->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
 #ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_3->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
@@ -396,14 +388,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )

    } else {

-        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                          amoebaGpu->psWorkUnit->_pDevData,
-                                                          gpu->psPosq4->_pDevData,
-                                                          amoebaGpu->psLabFrameDipole->_pDevData,
-                                                          amoebaGpu->psLabFrameQuadrupole->_pDevData,
-                                                          gpu->psBornRadii->_pDevData,
-                                                          amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                          amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                          amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
 #ifdef AMOEBA_DEBUG
                                                          amoebaGpu->psWorkArray_3_3->_pDevData,
                                                          debugArray->_pDevData, targetAtom );
@@ -413,27 +400,13 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
    }
    LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel");

-#if 0
-        for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-            //float index = 1.0f;
-            float index = (float) ii;
-            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
-                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
-                amoebaGpu->psWorkArray_3_1->_pSysData[kk]   = index;
-                amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
-                amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
-            }
-        }
-        amoebaGpu->psWorkArray_3_1->Upload();
-#endif
-
    kReduceEAndGkFields( amoebaGpu );

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );

@@ -481,15 +454,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
           }
        }
        (void) fflush( amoebaGpu->log );
-
-        //printEFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
-        //printEFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
-        //printEFieldAtomBuffers( amoebaGpu, 100 );
-        //printEFieldBuffer( amoebaGpu, 0 );
-        //printEFieldBuffer( amoebaGpu, 1 );
-        //printEFieldBuffer( amoebaGpu, 37 );
-        //printEFieldBuffer( amoebaGpu, 38 );
-
        (void) fprintf( amoebaGpu->log, "EFields End\n" );
        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
        debugArray->Download();
@@ -497,7 +461,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
            int ii = targetAtom;
            (void) fprintf( amoebaGpu->log,"\n" );
            int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-            unsigned int count                         = 0;
            for( int jj = 0; jj < gpu->natoms; jj++ ){
                int debugIndex = jj; 
                (void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
@@ -521,10 +484,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, NULL, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, NULL, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psGk_Field,     outputVector, NULL, 1.0f);
+            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, NULL, 1.0f );
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector );

         }
@@ -532,5 +495,4 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
    }
 #endif
 
-//exit(0);
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -36,13 +36,14 @@ void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext amoebaGpu)

 static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
    LAUNCHERROR("kReduceE_Fields1");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
    LAUNCHERROR("kReduceE_Fields2");
 }
@@ -73,12 +74,6 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
 void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
 {
  
-   // ---------------------------------------------------------------------------------------
-
-    static unsigned int threadsPerBlock = 0;
-
-   // ---------------------------------------------------------------------------------------
-
    gpuContext gpu    = amoebaGpu->gpuContext;

 #ifdef AMOEBA_DEBUG
@@ -104,6 +99,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )

    kClearFields_3( amoebaGpu, 2 );

+    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){ 
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
@@ -118,15 +114,14 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n", methodName,
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
    }
 #endif

    if (gpu->bOutputBufferPerWarp){
-
-        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -137,7 +132,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
 #endif
    } else {

-        kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
@@ -151,7 +146,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
    LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");

 #if 0
-        for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
+        for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
            //float index = 1.0f;
            float index = (float) ii;
            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
@@ -170,7 +165,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
        amoebaGpu->psWorkArray_3_1->Download();
@@ -205,14 +200,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
        }
        (void) fflush( amoebaGpu->log );

-        //printEFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
-        //printEFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
-        //printEFieldAtomBuffers( amoebaGpu, 100 );
-        //printEFieldBuffer( amoebaGpu, 0 );
-        //printEFieldBuffer( amoebaGpu, 1 );
-        //printEFieldBuffer( amoebaGpu, 37 );
-        //printEFieldBuffer( amoebaGpu, 38 );
-
        (void) fprintf( amoebaGpu->log, "EFields End\n" );
        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
        debugArray->Download();

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
@@ -129,7 +129,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
            else  // bExclusion
            {
                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];

@@ -182,38 +182,38 @@ if( 0 && atomI == targetAtom ){
 /*
            pullBackIndex                     += 2;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex++];
            debugArray[index].y                = pullBack[pullBackIndex++];
            debugArray[index].z                = pullBack[pullBackIndex++];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex++];
            debugArray[index].y                = pullBack[pullBackIndex++];
            debugArray[index].z                = pullBack[pullBackIndex++];
 */

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexI][0];
            debugArray[index].y                = ijField[indexI][1];
            debugArray[index].z                = ijField[indexI][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexJ][0];
            debugArray[index].y                = ijField[indexJ][1];
            debugArray[index].z                = ijField[indexJ][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = match ? 0.0f : dScaleVal*ijField[indexI][0];
            debugArray[index].y                = match ? 0.0f : dScaleVal*ijField[indexI][1];
            debugArray[index].z                = match ? 0.0f : dScaleVal*ijField[indexI][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = match ? 0.0f : pScaleVal*ijField[indexI][0];
            debugArray[index].y                = match ? 0.0f : pScaleVal*ijField[indexI][1];
            debugArray[index].z                = match ? 0.0f : pScaleVal*ijField[indexI][2];
 /*
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            unsigned int mask                  = 1 << j;
            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
@@ -222,12 +222,12 @@ if( 0 && atomI == targetAtom ){
            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
            debugArray[index].w                = pScaleVal + 10.0f;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = jCoord.x;
            debugArray[index].y                = jCoord.y;
            debugArray[index].z                = jCoord.z;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = iCoord.x;
            debugArray[index].y                = iCoord.y;
            debugArray[index].z                = iCoord.z;
@@ -241,11 +241,11 @@ if( 0 && atomI == targetAtom ){
            // Write results

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
 #else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,       outputEField );
            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
 #endif
@@ -314,48 +314,48 @@ if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){
            debugArray[index].z                = pullBack[pullBackIndex++];
            debugArray[index].w                = pullBack[pullBackIndex++];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex++];
            debugArray[index].y                = pullBack[pullBackIndex++];
            debugArray[index].z                = pullBack[pullBackIndex++];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex++];
            debugArray[index].y                = pullBack[pullBackIndex++];
            debugArray[index].z                = pullBack[pullBackIndex++];
 */
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexI][0];
            debugArray[index].y                = ijField[indexI][1];
            debugArray[index].z                = ijField[indexI][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexJ][0];
            debugArray[index].y                = ijField[indexJ][1];
            debugArray[index].z                = ijField[indexJ][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexI][0];
            debugArray[index].y                = ijField[indexI][1];
            debugArray[index].z                = ijField[indexI][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexI][0];
            debugArray[index].y                = ijField[indexI][1];
            debugArray[index].z                = ijField[indexI][2];

 #if 0
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = jCoord.x;
            debugArray[index].y                = jCoord.y;
            debugArray[index].z                = jCoord.z;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = iCoord.x;
            debugArray[index].y                = iCoord.y;
            debugArray[index].z                = iCoord.z;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            unsigned int mask                  = 1 << j;
            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
@@ -378,7 +378,7 @@ if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){

                unsigned int xi   = x >> GRIDBITS;
                unsigned int yi   = y >> GRIDBITS;
-                unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
                int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];

@@ -435,38 +435,38 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
            pullBackIndex                     += 2;


-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex++];
            debugArray[index].y                = pullBack[pullBackIndex++];
            debugArray[index].z                = pullBack[pullBackIndex++];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pullBack[pullBackIndex++];
            debugArray[index].y                = pullBack[pullBackIndex++];
            debugArray[index].z                = pullBack[pullBackIndex++];
 */

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexI][0];
            debugArray[index].y                = ijField[indexI][1];
            debugArray[index].z                = ijField[indexI][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = ijField[indexJ][0];
            debugArray[index].y                = ijField[indexJ][1];
            debugArray[index].z                = ijField[indexJ][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = dScaleVal*ijField[indexI][0];
            debugArray[index].y                = dScaleVal*ijField[indexI][1];
            debugArray[index].z                = dScaleVal*ijField[indexI][2];

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = pScaleVal*ijField[indexI][0];
            debugArray[index].y                = pScaleVal*ijField[indexI][1];
            debugArray[index].z                = pScaleVal*ijField[indexI][2];
 /*
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            unsigned int mask                  = 1 << j;
            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
@@ -476,12 +476,12 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
            debugArray[index].w                = pScaleVal + 10.0f;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = jCoord.x;
            debugArray[index].y                = jCoord.y;
            debugArray[index].z                = jCoord.z;

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            index                             += cSim.paddedNumberOfAtoms;
            debugArray[index].x                = iCoord.x;
            debugArray[index].y                = iCoord.y;
            debugArray[index].z                = iCoord.z;
@@ -500,20 +500,20 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){


 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );

-            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );

 #else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, fieldSum,       outputEField );
            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );

-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
            load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
@@ -133,7 +133,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
 #endif
                                        );

-                unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask       =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;

                // torques include i == j contribution

@@ -172,29 +172,29 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

        index = debugAccumulate( index, debugArray, force,          mask, 1.0f );

-        mask  =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+        mask  =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
        index = debugAccumulate( index, debugArray, torque[indexI], mask, 2.0f );
        index = debugAccumulate( index, debugArray, torque[indexJ], mask, 3.0f );

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[0].x;
        debugArray[index].y                = pullBack[0].y;
        debugArray[index].z                = pullBack[0].z;
        debugArray[index].w                = pullBack[0].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[1].x;
        debugArray[index].y                = pullBack[1].y;
        debugArray[index].z                = pullBack[1].z;
        debugArray[index].w                = pullBack[1].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[2].x;
        debugArray[index].y                = pullBack[2].y;
        debugArray[index].z                = pullBack[2].z;
        debugArray[index].w                = pullBack[2].w;
 /*
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = dBorn[0];
        debugArray[index].y                = dBornPolar[0];
        debugArray[index].z                = dBorn[1];
@@ -209,7 +209,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            float  of;
-            unsigned int offset                 = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int offset                 = x + tgx + warp*cSim.paddedNumberOfAtoms;

            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
@@ -219,23 +219,17 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            of                                 += dBornPolarSum;
            cAmoebaSim.pWorkArray_1_2[offset]   = of;

-
-            offset                             *= 3;
-
-            load3dArrayBufferPerWarp( offset, localParticle.force,  cAmoebaSim.pWorkArray_3_1 );
-            load3dArrayBufferPerWarp( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
+            add3dArrayToFloat4(   offset, localParticle.force,  cSim.pForce4 );
+            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );

 #else
-            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;

            cAmoebaSim.pWorkArray_1_1[offset]   = dBornSum;
            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

-            offset                             *= 3;
-
-            load3dArray( offset, localParticle.force,  cAmoebaSim.pWorkArray_3_1 );
-            load3dArray( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
-
+            add3dArrayToFloat4(    offset, localParticle.force,  cSim.pForce4);
+            load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
 #endif

        }
@@ -278,7 +272,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
 #endif
                                        );

-                unsigned int mask          =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                unsigned int mask          =  ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;

                // add force and torque to atom I due atom J

@@ -325,25 +319,25 @@ if( atomI == targetAtom || atomJ == targetAtom ){
        index = debugAccumulate( index, debugArray, torque[indexI], mask, -2.0f );
        index = debugAccumulate( index, debugArray, torque[indexJ], mask, -3.0f );

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[0].x;
        debugArray[index].y                = pullBack[0].y;
        debugArray[index].z                = pullBack[0].z;
        debugArray[index].w                = -1.0f;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[1].x;
        debugArray[index].y                = pullBack[1].y;
        debugArray[index].z                = pullBack[1].z;
        debugArray[index].w                = pullBack[1].w;

-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[2].x;
        debugArray[index].y                = pullBack[2].y;
        debugArray[index].z                = pullBack[2].z;
        debugArray[index].w                = pullBack[2].w;
 /*
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        index                             += cSim.paddedNumberOfAtoms;
        debugArray[index].x                = dBorn[0];
        debugArray[index].y                = dBornPolar[0];
        debugArray[index].z                = dBorn[1];
@@ -355,7 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
 //#ifdef AMOEBA_DEBUG
 #if 0
 if( mask || !mask ){
-        unsigned int index                 = atomJ + atomI*cAmoebaSim.paddedNumberOfAtoms;
+        unsigned int index                 = atomJ + atomI*cSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) atomJ;
        debugArray[index].z                = energy;
@@ -372,7 +366,7 @@ if( mask || !mask ){
 #ifdef USE_OUTPUT_BUFFER_PER_WARP

            float of;
-            unsigned int offset                 = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int offset                 = x + tgx + warp*cSim.paddedNumberOfAtoms;

            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
@@ -382,12 +376,10 @@ if( mask || !mask ){
            of                                 += dBornPolarSum;
            cAmoebaSim.pWorkArray_1_2[offset]   = of;

-            offset                             *= 3;
-
-            load3dArrayBufferPerWarp( offset, localParticle.force,  cAmoebaSim.pWorkArray_3_1 );
-            load3dArrayBufferPerWarp( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
+            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
+            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );

-            offset                              = y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
+            offset                              = y + tgx + warp*cSim.paddedNumberOfAtoms;

            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
@@ -397,31 +389,24 @@ if( mask || !mask ){
            of                                 += dBornPolarSum;
            cAmoebaSim.pWorkArray_1_2[offset]   = of;

-            offset                             *= 3;
-
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,  cAmoebaSim.pWorkArray_3_1 );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
+            add3dArrayToFloat4(   offset, sA[threadIdx.x].force,  cSim.pForce4 );
+            load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
 #else
-            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
+            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;

            cAmoebaSim.pWorkArray_1_1[offset]   = dBornSum;
            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

-            offset                             *= 3;
-
-            load3dArray( offset, localParticle.force,  cAmoebaSim.pWorkArray_3_1 );
-            load3dArray( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
+            add3dArrayToFloat4(   offset, localParticle.force,  cSim.pForce4 );
+            load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );

-
-            offset                              = y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;

            cAmoebaSim.pWorkArray_1_1[offset]   = sA[threadIdx.x].dBornRadius;
            cAmoebaSim.pWorkArray_1_2[offset]   = sA[threadIdx.x].dBornRadiusPolar;

-            offset                             *= 3;
-
-            load3dArray( offset, sA[threadIdx.x].force,  cAmoebaSim.pWorkArray_3_1 );
-            load3dArray( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
+            add3dArrayToFloat4( offset,    sA[threadIdx.x].force,  cSim.pForce4 );
+            load3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );

 #endif
            lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -894,7 +894,7 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
 #ifdef AMOEBA_DEBUG
 __device__ static int debugAccumulate( unsigned int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
 {
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = addMask ? field[0] : 0.0f;
    debugArray[index].y                = addMask ? field[1] : 0.0f;
    debugArray[index].z                = addMask ? field[2] : 0.0f;
@@ -928,79 +928,18 @@ __device__ void zeroKirkwoodEDiffParticleSharedField( struct KirkwoodEDiffPartic
 #define METHOD_NAME(a, b) a##N2ByWarp##b
 #include "kCalculateAmoebaCudaKirkwoodEDiff.h"

-// reduce psWorkArray_3_1 -> force
-// reduce psWorkArray_3_2 -> torque
+// reduce psWorkArray_3_1 -> torque

-static void kReduceForceTorque( amoebaGpuContext amoebaGpu )
+static void kReduceTorque( amoebaGpuContext amoebaGpu )
 {

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodEDiffForce->_pDevData );
-    LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff1");
-
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
-
-    LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff2");
-}
-
-#ifdef AMOEBA_DEBUG
-//#if 1
-static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
-{
-    (void) fprintf( amoebaGpu->log, "KirkwoodEDiff Buffer %u\n", bufferIndex );
-    unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
-    unsigned int stop  = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
-    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        unsigned int ii3Index      = ii/3;
-        unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
-        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
-    } 
-
-/*
-    start = 0;
-    stop  = -146016;
-    float maxV = -1.0e+99;
-    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        if(  amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){ 
-            unsigned int ii3Index      = ii/3;
-            unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
-            unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
-            (void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n", 
-                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
-            maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
-        } 
-    } 
-*/
-}
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                           gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
+                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );

-static void printKirkwoodEDiffAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
-{
-    (void) fprintf( amoebaGpu->log, "KirkwoodEDiff atom %u\n", targetAtom );
-    for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
-        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
-                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
-    } 
+    LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff");
 }
-#endif

 /**---------------------------------------------------------------------------------------

@@ -1016,7 +955,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
  
   // ---------------------------------------------------------------------------------------

-    static unsigned int threadsPerBlock = 0;
    static int timestep                 = 0;
    timestep++;
 #ifdef AMOEBA_DEBUG
@@ -1050,6 +988,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
    kClearFields_3( amoebaGpu, 6 );
    LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff");

+    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
@@ -1065,7 +1004,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
    if( amoebaGpu->log && timestep == 1 ){
        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
                                        " ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n",
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock );
        (void) fflush( amoebaGpu->log );
@@ -1074,7 +1013,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )

    if (gpu->bOutputBufferPerWarp){

-        kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
@@ -1083,17 +1022,16 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
                                                                           amoebaGpu->psInducedDipoleS->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolarS->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif

    } else {

-        kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
@@ -1102,160 +1040,23 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
                                                                           amoebaGpu->psInducedDipoleS->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolarS->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
    }

    LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");

-    kReduceForceTorque( amoebaGpu );
+    kReduceTorque( amoebaGpu );
    LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff");

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_3_2->Download();
-
-        amoebaGpu->psKirkwoodEDiffForce->Download();
-        amoebaGpu->psTorque->Download();
-        debugArray->Download();
-
-        int maxPrint        = 1400;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-    
-           // force
-
-           (void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
-    
-           // torque
-
-           (void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psTorque->_pSysData[indexOffset],
-                           amoebaGpu->psTorque->_pSysData[indexOffset+1],
-                           amoebaGpu->psTorque->_pSysData[indexOffset+2] );
-
-           if( ii == targetAtom ){
-               (void) fprintf( amoebaGpu->log,"\n" );
-               int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-               for( int jj = 0; jj < gpu->natoms; jj++ ){
-                   int debugIndex = jj;
-                   (void) fprintf( amoebaGpu->log,"%5d %5d ediff F%T\n", ii, jj );
-                   for( int kk = 0; kk < 5; kk++ ){
-                       (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                       debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                       debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                       debugIndex += paddedNumberOfAtoms;
-                   }
-                   (void) fprintf( amoebaGpu->log,"\n" );
-
-               }
-
-               (void) fprintf( amoebaGpu->log,"\n" );
-           }
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-        {
-            (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
-            int maxPrint = 12;
-            for( int ii = 0; ii < gpu->natoms; ii++ ){
-    
-                // print cpu & gpu reductions
-    
-                int offset  = 3*ii;
-    
-                (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
-                                amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset],
-                                amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+1],
-                                amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+2],
-                                amoebaGpu->psTorque->_pSysData[offset],
-                                amoebaGpu->psTorque->_pSysData[offset+1],
-                                amoebaGpu->psTorque->_pSysData[offset+2] );
-                if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
-            }   
-        }   
-
-        if( 1 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray(  gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray(  gpu->natoms, 3, amoebaGpu->psTorque,             outputVector, NULL, 1.0f);
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
-         }
-
-    }   
-    delete debugArray;
-
-#endif
-
    // map torques to forces

-    cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce, amoebaGpu->gpuContext->psForce4 );
+    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        cudaComputeAmoebaMapTorques( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce );
-        amoebaGpu->psKirkwoodEDiffForce->Download();
-        (void) fprintf( amoebaGpu->log, "Mapped KirkwoodEDiff torques to forces.\n" ); (void) fflush( amoebaGpu->log );
-
-        int maxPrint        = 1400;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-    
-           // force
-
-           (void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
-    
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-        if( 1 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray(  gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
-         }
-
-    }   
-#endif
-
-    if( 0 ){
-        cudaComputeAmoebaMapTorques( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce );
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray(  gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f/4.184 );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
-    }

   // ---------------------------------------------------------------------------------------
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
@@ -43,7 +43,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
                            float* inducedDipolePolar,
                            float* inducedDipoleS,
                            float* inducedDipolePolarS,
-                            float* outputForce,
                            float* outputTorque
 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
@@ -138,7 +137,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
 #endif
                                            );

-                    unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    unsigned int mask       =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;

                    // torques include i == j contribution

@@ -166,12 +165,12 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

            debugArray[index].x                = (float) atomI;
            debugArray[index].y                = (float) atomJ;
-            mask                               =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+            mask                               =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
            debugArray[index].z                = mask ? tinker_f*energy : 0.0f;

            index = debugAccumulate( index, debugArray, force,          mask, 1.0f );

-            mask  =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+            mask  =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
            index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f );
            index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f );
 }
@@ -181,7 +180,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            else // bExclusion
            {
                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];

@@ -203,7 +202,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
 #endif
                                            );

-                    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;

                    // torques include i == j contribution

@@ -233,7 +232,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

            index = debugAccumulate( index, debugArray, force,          mask, 1.0f );

-            //mask  =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+            //mask  =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
            index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f );
            index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f );
 }
@@ -249,15 +248,15 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            scale3dArray( tinker_f, localParticle.torque );

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = x + tgx + warp*cSim.paddedNumberOfAtoms;

-            load3dArrayBufferPerWarp( offset, localParticle.force,  outputForce );
-            load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque );
+            add3dArrayToFloat4(         offset, localParticle.force,  cSim.pForce4 );
+            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );

 #else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
-            load3dArray( offset, localParticle.force,  outputForce );
-            load3dArray( offset, localParticle.torque, outputTorque );
+            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
+            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
+            load3dArray(      3*offset, localParticle.torque, outputTorque );

 #endif

@@ -304,7 +303,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                                            );


-                    unsigned int mask    =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    unsigned int mask    =  ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;

                    // add force and torque to atom I due atom J

@@ -355,7 +354,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

                unsigned int xi   = x >> GRIDBITS;
                unsigned int yi   = y >> GRIDBITS;
-                unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
                int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];

@@ -378,7 +377,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                                            );


-                    unsigned int mask    =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    unsigned int mask    =  ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;

                    // add force and torque to atom I due atom J

@@ -435,26 +434,26 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

 #ifdef USE_OUTPUT_BUFFER_PER_WARP

-            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = x + tgx + warp*cSim.paddedNumberOfAtoms;

-            load3dArrayBufferPerWarp( offset, localParticle.force,  outputForce );
-            load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque );
+            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
+            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );

-            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = y + tgx + warp*cSim.paddedNumberOfAtoms;

-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,  outputForce );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, outputTorque );
+            add3dArrayToFloat4(         offset, sA[threadIdx.x].force,  cSim.pForce4 );
+            load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, outputTorque );
 #else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;

-            load3dArray( offset, localParticle.force,  outputForce );
-            load3dArray( offset, localParticle.torque, outputTorque );
+            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
+            load3dArray(      3*offset, localParticle.torque, outputTorque );


-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;

-            load3dArray( offset, sA[threadIdx.x].force,  outputForce );
-            load3dArray( offset, sA[threadIdx.x].torque, outputTorque );
+            add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
+            load3dArray(      3*offset, sA[threadIdx.x].torque, outputTorque );

 #endif
            lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -33,8 +33,8 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
+#define AMOEBA_DEBUG
+//#undef AMOEBA_DEBUG

 #define GK
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
@@ -216,7 +216,7 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced
 #ifdef AMOEBA_DEBUG
 __device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
 {
-    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    index                             += cSim.paddedNumberOfAtoms;
    debugArray[index].x                = addMask ? field[0] : 0.0f;
    debugArray[index].y                = addMask ? field[1] : 0.0f;
    debugArray[index].z                = addMask ? field[2] : 0.0f;
@@ -256,7 +256,7 @@ void kInitializeMutualInducedAndGkField_kernel(
 {

    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId >= 3*cAmoebaSim.numberOfAtoms )return;
+    if( threadId >= 3*cSim.atoms )return;

    fixedEField[threadId]          *= polarizability[threadId];
    inducedDipole[threadId]         = fixedEField[threadId];
@@ -292,7 +292,7 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a

    // load deltas

-    while( pos < 3*cAmoebaSim.numberOfAtoms )
+    while( pos < 3*cSim.atoms )
    {   
        delta[threadIdx.x].x  += arrayOfDeltas1[pos];
        delta[threadIdx.x].y  += arrayOfDeltas2[pos];
@@ -324,12 +324,12 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
        epsilon[0]  = epsilon[0] < delta[0].y ? delta[0].y : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
-        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) cAmoebaSim.numberOfAtoms ) );
+        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) cSim.atoms ) );
 #ifdef AMOEBA_DEBUG
-        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) cAmoebaSim.numberOfAtoms ) );
-        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) cAmoebaSim.numberOfAtoms ) );
-        epsilon[3]  = 48.033324f*sqrtf( delta[0].z/( (float) cAmoebaSim.numberOfAtoms ) );
-        epsilon[4]  = 48.033324f*sqrtf( delta[0].w/( (float) cAmoebaSim.numberOfAtoms ) );
+        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) cSim.atoms ) );
+        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) cSim.atoms ) );
+        epsilon[3]  = 48.033324f*sqrtf( delta[0].z/( (float) cSim.atoms ) );
+        epsilon[4]  = 48.033324f*sqrtf( delta[0].w/( (float) cSim.atoms ) );
 #endif
    }   
 }
@@ -356,7 +356,7 @@ void kSorUpdateMutualInducedAndGkField_kernel(

    float polarSOR = 0.70f;
    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*cAmoebaSim.numberOfAtoms)return;
+    if( threadId  >= 3*cSim.atoms)return;

    float previousDipole                = inducedDipole[threadId];
    float previousDipoleP               = inducedDipoleP[threadId];
@@ -390,7 +390,7 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(

    float polarSOR = 0.70f;
    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if( threadId  >= 3*cAmoebaSim.numberOfAtoms)return;
+    if( threadId  >= 3*cSim.atoms)return;

    float previousDipole                = inducedDipole[threadId];
    float previousDipoleP               = inducedDipoleP[threadId];
@@ -415,23 +415,24 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
                                            CUDAStream<float>* outputArray,  CUDAStream<float>* outputPolarArray,
                                            CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
 {
-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    gpuContext gpu = amoebaGpu->gpuContext;
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceMutualInducedAndGkFields1");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
    LAUNCHERROR("kReduceMutualInducedAndGkFields2");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData );
    LAUNCHERROR("kReduceMutualInducedAndGkFields3");

-    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
-                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
+                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData );
    LAUNCHERROR("kReduceMutualInducedAndGkFields4");
 }
@@ -441,12 +442,12 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
 static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
 {
    (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
-    unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
-    unsigned int stop  = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
+    unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
+    unsigned int stop  = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
        unsigned int ii3Index      = ii/3;
-        unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
-        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
+        unsigned int bufferIndex   = ii3Index/(gpu->sim.paddedNumberOfAtoms);
+        unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
@@ -461,8 +462,8 @@ static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferI
 static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
 {
    (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
-    for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
-        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
+    for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
+        unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
@@ -530,8 +531,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
    
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
+                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
@@ -539,7 +540,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
 #endif

    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 amoebaGpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
@@ -552,7 +553,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
 #endif
    } else {

-        kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                            amoebaGpu->psWorkUnit->_pDevData,
                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
                                                            amoebaGpu->psWorkArray_3_2->_pDevData,
@@ -745,6 +746,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

+        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName ); fflush( amoebaGpu->log );
+
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
        amoebaGpu->psInducedDipole->Download(),
@@ -753,7 +756,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
        amoebaGpu->psInducedDipolePolarS->Download();
        amoebaGpu->psPolarizability->Download();

-        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
        int offset   = 0;
        int maxPrint = 10;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
@@ -921,7 +923,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
    }

 #ifdef AMOEBA_DEBUG
-    if( 0 ){
+    if( 1 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;