WCA force was overwriting total force vector

Modified out-of-plane bend force to improve stability Redid arguments to kCalculateAmoebaCudaKirkwood to reduce lmem

WCA force was overwriting total force vector
Modified out-of-plane bend force to improve stability Redid arguments to kCalculateAmoebaCudaKirkwood to reduce lmem
7eaa5d29 · Mark Friedrichs · 98b09e6f · 7eaa5d29 · 7eaa5d29 · 7eaa5d29
Commit 7eaa5d29 authored Jul 26, 2010 by Mark Friedrichs
11 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -500,6 +500,7 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    // multipoles

    kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );
+
 //kClearForces(gpu->gpuContext);
 //kClearEnergy(gpu->gpuContext);
 //(void) fprintf( data.getLog(), "computeAmoebaMultipoleForce clearing forces/energy after kCalculateAmoebaMultipoleForces()\n" );

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -1197,7 +1197,10 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
    // output
 
    amoebaGpu->psLabFrameDipole                      = new CUDAStream<float>(3*amoebaGpu->paddedNumberOfAtoms, 1, "LabFrameDipole");
+    amoebaGpu->amoebaSim.pLabFrameDipole             = amoebaGpu->psLabFrameDipole->_pDevStream[0];
+
    amoebaGpu->psLabFrameQuadrupole                  = new CUDAStream<float>(9*amoebaGpu->paddedNumberOfAtoms, 1, "LabFrameQuadrupole");
+    amoebaGpu->amoebaSim.pLabFrameQuadrupole         = amoebaGpu->psLabFrameQuadrupole->_pDevStream[0];

    memset( amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0], 0, sizeof(int)*4*amoebaGpu->paddedNumberOfAtoms );
 }
@@ -1275,7 +1278,11 @@ void gpuMutualInducedFieldAllocate( amoebaGpuContext amoebaGpu )
 #endif

    amoebaGpu->psInducedDipole                 = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipole");
+    amoebaGpu->amoebaSim.pInducedDipole        = amoebaGpu->psInducedDipole->_pDevStream[0];
+
    amoebaGpu->psInducedDipolePolar            = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolar");
+    amoebaGpu->amoebaSim.pInducedDipolePolar   = amoebaGpu->psInducedDipolePolar->_pDevStream[0];
+
    amoebaGpu->psCurrentEpsilon                = new CUDAStream<float>(5, 1, "CurrentEpsilon");
    amoebaGpu->epsilonThreadsPerBlock          = 384;
 
@@ -1326,19 +1333,15 @@ void gpuElectrostaticAllocate( amoebaGpuContext amoebaGpu )
    }
 #endif

-    amoebaGpu->psForce                         = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ForceuElectrostatic");
-    amoebaGpu->psTorque                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ForceTorque");
+    amoebaGpu->psForce                         = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ElectrostaticForce");
+    amoebaGpu->psTorque                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Torque");

    unsigned int offset                        = 3*paddedNumberOfAtoms*sizeof( float );
    memset( amoebaGpu->psForce->_pSysStream[0],          0, offset );
    memset( amoebaGpu->psTorque->_pSysStream[0],         0, offset );

-    offset                                     = paddedNumberOfAtoms*sizeof( float );
-//  memset( amoebaGpu->psEnergy->_pSysStream[0],         0, offset );
-
    amoebaGpu->psForce->Download();
    amoebaGpu->psTorque->Download();
-//    amoebaGpu->psEnergy->Download();
    
 }

@@ -1363,22 +1366,27 @@ void gpuKirkwoodAllocate( amoebaGpuContext amoebaGpu )
        return;
    }

-    int paddedNumberOfAtoms                = amoebaGpu->paddedNumberOfAtoms;
+    int paddedNumberOfAtoms                      = amoebaGpu->paddedNumberOfAtoms;

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms=%d\n", methodName.c_str(), paddedNumberOfAtoms );
+        (void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms      =%d\n", methodName.c_str(), paddedNumberOfAtoms );
        (void) fflush( amoebaGpu->log );
    }
 #endif

-    amoebaGpu->psBorn                      = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBorn");
-    amoebaGpu->psBornPolar                 = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBornPolar");
-    amoebaGpu->psGk_Field                  = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Gk_Fixed_Field");
-    amoebaGpu->psInducedDipoleS            = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipoleS");
-    amoebaGpu->psInducedDipolePolarS       = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolarS");
-    amoebaGpu->psKirkwoodForce             = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodForce");
-    amoebaGpu->psKirkwoodEDiffForce        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodEDiffForce");
+    amoebaGpu->psBorn                            = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBorn");
+    amoebaGpu->psBornPolar                       = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBornPolar");
+    amoebaGpu->psGk_Field                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Gk_Fixed_Field");
+
+    amoebaGpu->psInducedDipoleS                  = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipoleS");
+    amoebaGpu->amoebaSim.pInducedDipoleS         = amoebaGpu->psInducedDipoleS->_pDevStream[0];
+
+    amoebaGpu->psInducedDipolePolarS             = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolarS");
+    amoebaGpu->amoebaSim.pInducedDipolePolarS    = amoebaGpu->psInducedDipolePolarS->_pDevStream[0];
+
+    amoebaGpu->psKirkwoodForce                   = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodForce");
+    amoebaGpu->psKirkwoodEDiffForce              = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodEDiffForce");

    unsigned int offset                    = paddedNumberOfAtoms*sizeof( float );
    memset( amoebaGpu->psBorn->_pSysStream[0],               0, offset );
@@ -2685,7 +2693,10 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
        (void) fflush( amoebaGpu->log );
    }
    amoebaGpu->psWorkArray_3_1            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_1");
+    amoebaGpu->amoebaSim.pWorkArray_3_1   = amoebaGpu->psWorkArray_3_1->_pDevStream[0];
+
    amoebaGpu->psWorkArray_3_2            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_2");
+    amoebaGpu->amoebaSim.pWorkArray_3_2   = amoebaGpu->psWorkArray_3_2->_pDevStream[0];

    // used GK
    amoebaGpu->psWorkArray_3_3            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_3");
@@ -2694,7 +2705,10 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
    amoebaGpu->psWorkArray_3_6            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_6");

    amoebaGpu->psWorkArray_1_1            = new CUDAStream<float>(  paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_1");
+    amoebaGpu->amoebaSim.pWorkArray_1_1   = amoebaGpu->psWorkArray_1_1->_pDevStream[0];
+
    amoebaGpu->psWorkArray_1_2            = new CUDAStream<float>(  paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_2");
+    amoebaGpu->amoebaSim.pWorkArray_1_2   = amoebaGpu->psWorkArray_1_2->_pDevStream[0];

    amoebaGpu->psEnergy                   = new CUDAStream<float>(amoebaGpu->energyOutputBuffers, 1, "AmoebaEnergy");


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -41,231 +41,6 @@
 #include <builtin_types.h>
 #include <vector_functions.h>

-#if 0
-#define RTERROR(status, s) \
-    if (status != cudaSuccess) { \
-        printf("%s %s\n", s, cudaGetErrorString(status)); \
-        exit(-1); \
-    }
-
-#define LAUNCHERROR(s) \
-    { \
-        cudaError_t status = cudaGetLastError(); \
-        if (status != cudaSuccess) { \
-            printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), s); \
-            exit(-1); \
-        } \
-    }
-#endif
-
-#if 0
-// Pure virtual class to define an interface for objects resident both on GPU and CPU
-struct SoADeviceObject {
-    virtual void Allocate() = 0;
-    virtual void Deallocate() = 0;
-    virtual void Upload() = 0;
-    virtual void Download() = 0;
-};
-
-template <typename T>
-struct CUDAStream : public SoADeviceObject
-{
-    unsigned int    _length;
-    unsigned int    _subStreams;
-    unsigned int    _stride;
-    T**             _pSysStream;
-    T**             _pDevStream;
-    T*              _pSysData;
-    T*              _pDevData;
-    std::string     _name;
-    CUDAStream(int length, int subStreams = 1, std::string name="");
-    CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
-    CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
-    CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
-    virtual ~CUDAStream();
-    void Allocate();
-    void Deallocate();
-    void Upload();
-    void Download();
-    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
-    T& operator[](int index);
-};
-
-float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
-
-template <typename T>
-CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::~CUDAStream()
-{
-    Deallocate();
-}
-
-template <typename T>
-void CUDAStream<T>::Allocate()
-{
-    cudaError_t status;
-    _pSysStream =   new T*[_subStreams];
-    _pDevStream =   new T*[_subStreams];
-    _pSysData =     new T[_subStreams * _stride];
-
-    status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
-    RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
-
-    for (unsigned int i = 0; i < _subStreams; i++)
-    {
-        _pSysStream[i] = _pSysData + i * _stride;
-        _pDevStream[i] = _pDevData + i * _stride;
-    }
-}
-
-template <typename T>
-void CUDAStream<T>::Deallocate()
-{
-    cudaError_t status;
-    delete[] _pSysStream;
-    _pSysStream = NULL;
-    delete[] _pDevStream;
-    _pDevStream = NULL;
-    delete[] _pSysData;
-    _pSysData = NULL;
-    status = cudaFree(_pDevData);
-    RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Upload()
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Download()
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
-{
-    T* pTemp = new T[_subStreams * _stride];
-    unsigned int stream = 0;
-    unsigned int pos = 0;
-    unsigned int newstride = _stride * _subStreams / newstreams;
-    unsigned int newlength = _length * _subStreams / newstreams;
-
-    // Copy data into new format
-    for (unsigned int i = 0; i < _length; i++)
-    {
-        for (unsigned int j = 0; j < _subStreams; j++)
-        {
-            pTemp[stream * newstride + pos] = _pSysStream[j][i];
-            stream++;
-            if (stream == newstreams)
-            {
-                stream = 0;
-                pos++;
-            }
-        }
-    }
-
-    // Remap stream pointers;
-    for (unsigned int i = 0; i < newstreams; i++)
-    {
-        _pSysStream[i] = _pSysData + i * newstride;
-        _pDevStream[i] = _pDevData + i * newstride;
-    }
-
-    // Copy data back intro original stream
-    for (unsigned int i = 0; i < newlength; i++)
-        for (unsigned int j = 0; j < newstreams; j++)
-            _pSysStream[j][i] = pTemp[j * newstride + i];
-    
-    _stride = newstride;
-    _length = newlength;
-    _subStreams = newstreams;
-    delete[] pTemp;
-}
-
-template <typename T>
-T& CUDAStream<T>::operator[](int index)
-{
-    return _pSysData[index];
-}
-
-static const unsigned int GRID = 32;
-static const unsigned int GRIDBITS = 5;
-static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
-static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
-static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
-static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
-static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
-static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
-static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
-static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
-static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
-static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
-static const int G8X_THREADS_PER_BLOCK                  = 256;
-static const int GT2XX_THREADS_PER_BLOCK                = 256;
-static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
-static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
-static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
-static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
-static const unsigned int MAX_STACK_SIZE = 8;
-static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
-
-static const float PI = 3.14159265358979323846f;
-
-static const int PME_ORDER = 4;
-
-enum CudaNonbondedMethod
-{
-    NO_CUTOFF,
-    CUTOFF,
-    PERIODIC,
-    EWALD,
-    PARTICLE_MESH_EWALD
-};
-
-enum ExpressionOp {
-    CONSTANT = 0, VARIABLE0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, GLOBAL, CUSTOM, CUSTOM_DERIV, ADD, SUBTRACT, MULTIPLY, DIVIDE,
-        POWER, NEGATE, SQRT, EXP, LOG, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SQUARE, CUBE, RECIPROCAL, ADD_CONSTANT, MULTIPLY_CONSTANT, POWER_CONSTANT
-};
-
-template<int SIZE>
-struct Expression {
-    int op[SIZE];
-    float arg[SIZE];
-    int length, stackSize;
-};
-#endif
-
 struct cudaAmoebaGmxSimulation {
    // Constants

@@ -344,6 +119,19 @@ struct cudaAmoebaGmxSimulation {
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors

+    float* pLabFrameDipole;
+    float* pLabFrameQuadrupole;
+    float* pInducedDipole;
+    float* pInducedDipolePolar;
+
+    float* pInducedDipoleS;
+    float* pInducedDipolePolarS;
+
+    float* pWorkArray_3_1;
+    float* pWorkArray_3_2;
+    float* pWorkArray_1_1;
+    float* pWorkArray_1_2;
+
    unsigned int amoebaVdwNonReductions;
    int* pAmoebaVdwNonReductionID;


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -867,11 +867,11 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
-    LAUNCHERROR("kReduceForceTorque1");
+    LAUNCHERROR("kReduceElectrostaticForce");
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
-    LAUNCHERROR("kReduceForceTorque2");
+    LAUNCHERROR("kReduceElectrostaticTorque");
 }

 #ifdef AMOEBA_DEBUG
@@ -1137,10 +1137,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector);
-(void) fprintf( amoebaGpu->log, "%s calling cudaWriteVectorOfDoubleVectorsToFile \n", methodName ); fflush( amoebaGpu->log );
            cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
-(void) fprintf( amoebaGpu->log, "%s called cudaWriteVectorOfDoubleVectorsToFile \n", methodName ); fflush( amoebaGpu->log );
-
         }

    }   

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -2386,21 +2386,11 @@ threadsPerBlock = 32;
 #endif

        kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipoleS->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
-                                                                           gpu->psBornRadii->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_1_1->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevStream[0]
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_1_2->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           , debugArray->_pDevStream[0], targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_1_2->_pDevStream[0] );
+                                                                           );
 #endif
    }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
@@ -37,18 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
 */
 void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
-                            unsigned int* workUnit,
-                            float4* atomCoord,
-                            float* labFrameDipole,
-                            float* labFrameQuadrupole,
-                            float* inducedDipole,
-                            float* inducedDipolePolar,
-                            float* bornRadii,
-                            float* outputForce,
-                            float* outputTorque,
-                            float* output_dBornRadius,
-                            float* output_dBornRadiusPolar
-
+                            unsigned int* workUnit
 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
 #endif
@@ -67,6 +56,12 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;

+    // pWorkArray_3_1 == force
+    // pWorkArray_3_2 == torque
+
+    // pWorkArray_1_1 == dBorn
+    // pWorkArray_1_2 == dBornPolar
+
    float4 jCoord;
    float jDipole[3];     
    float jQuadrupole[9];     
@@ -94,7 +89,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
        KirkwoodParticle* psA         = &sA[tbx];

        unsigned int atomI            = x + tgx;
-        float4 iCoord                 = atomCoord[atomI];
+        float4 iCoord                 =  cSim.pPosq[atomI];

        float forceSum[3];
        float torqueSum[3];
@@ -120,8 +115,8 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
            // load shared data

            loadKirkwoodShared( &(sA[threadIdx.x]), atomI,
-                                atomCoord, labFrameDipole, labFrameQuadrupole,
-                                inducedDipole, inducedDipolePolar, bornRadii );
+                                cSim.pPosq, cAmoebaSim.pLabFrameDipole, cAmoebaSim.pLabFrameQuadrupole,
+                                cAmoebaSim.pInducedDipoleS,  cAmoebaSim.pInducedDipolePolarS, cSim.pBornRadii );

            // this branch is never exercised since it includes the
            // interaction between atomI and itself which is always excluded
@@ -145,11 +140,11 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(

                calculateKirkwoodPairIxn_kernel( sameAtom,
                                                 iCoord,                                     jCoord,
-                                                 &(labFrameDipole[3*atomI]),                 jDipole,
-                                                 &(labFrameQuadrupole[9*atomI]),             jQuadrupole,
-                                                 &(inducedDipole[3*atomI]),                  jInducedDipole,
-                                                 &(inducedDipolePolar[3*atomI]),             jInducedDipolePolar,
-                                                 bornRadii[atomI],                           jBornRadius,
+                                                 &(cAmoebaSim.pLabFrameDipole[3*atomI]),     jDipole,
+                                                 &(cAmoebaSim.pLabFrameQuadrupole[9*atomI]), jQuadrupole,
+                                                 &(cAmoebaSim.pInducedDipoleS[3*atomI]),     jInducedDipole,
+                                                 &(cAmoebaSim.pInducedDipolePolarS[3*atomI]),jInducedDipolePolar,
+                                                 cSim.pBornRadii[atomI],                     jBornRadius,
                                                 force, torque, dBorn, dBornPolar, &energy
 #ifdef AMOEBA_DEBUG
                                          , pullBack
@@ -188,7 +183,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) atomJ;
        debugArray[index].z                = (float) (mask + 1);
-        //debugArray[index].z                = bornRadii[atomI];
+        //debugArray[index].z                = cSim.pBornRadii[atomI];
        //debugArray[index].z                = energy;
        //debugArray[index].w                = (float) (blockIdx.x*blockDim.x+threadIdx.x);
        debugArray[index].w                = jBornRadius;
@@ -234,30 +229,30 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            float  of;
            unsigned int offset                 = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;

-            of                                  = output_dBornRadius[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
-            output_dBornRadius[offset]          = of;
+            cAmoebaSim.pWorkArray_1_1[offset]   = of;

-            of                                  = output_dBornRadiusPolar[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_2[offset];
            of                                 += dBornPolarSum;
-            output_dBornRadiusPolar[offset]     = of;
+            cAmoebaSim.pWorkArray_1_2[offset]   = of;


            offset                             *= 3;

-            load3dArrayBufferPerWarp( offset, forceSum,  outputForce );
-            load3dArrayBufferPerWarp( offset, torqueSum, outputTorque );
+            load3dArrayBufferPerWarp( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArrayBufferPerWarp( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );

 #else
            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;

-            output_dBornRadius[offset]          = dBornSum;
-            output_dBornRadiusPolar[offset]     = dBornPolarSum;
+            cAmoebaSim.pWorkArray_1_1[offset]   = dBornSum;
+            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

            offset                             *= 3;

-            load3dArray( offset, forceSum,  outputForce );
-            load3dArray( offset, torqueSum, outputTorque );
+            load3dArray( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArray( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );

 #endif

@@ -271,8 +266,8 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                // load shared data

                loadKirkwoodShared( &(sA[threadIdx.x]), (y+tgx),
-                                    atomCoord, labFrameDipole, labFrameQuadrupole,
-                                    inducedDipole, inducedDipolePolar, bornRadii );
+                                    cSim.pPosq, cAmoebaSim.pLabFrameDipole, cAmoebaSim.pLabFrameQuadrupole,
+                                    cAmoebaSim.pInducedDipoleS, cAmoebaSim.pInducedDipolePolarS, cSim.pBornRadii);

            }

@@ -299,12 +294,12 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                                  jInducedDipole, jInducedDipolePolar, &jBornRadius );

                calculateKirkwoodPairIxn_kernel( sameAtom, 
-                                                 iCoord,                                     jCoord,
-                                                 &(labFrameDipole[3*atomI]),                 jDipole,
-                                                 &(labFrameQuadrupole[9*atomI]),             jQuadrupole,
-                                                 &(inducedDipole[3*atomI]),                  jInducedDipole,
-                                                 &(inducedDipolePolar[3*atomI]),             jInducedDipolePolar,
-                                                 bornRadii[atomI],                           jBornRadius,
+                                                 iCoord,                                       jCoord,
+                                                 &(cAmoebaSim.pLabFrameDipole[3*atomI]),       jDipole,
+                                                 &(cAmoebaSim.pLabFrameQuadrupole[9*atomI]),   jQuadrupole,
+                                                 &(cAmoebaSim.pInducedDipoleS[3*atomI]),       jInducedDipole,
+                                                 &(cAmoebaSim.pInducedDipolePolarS[3*atomI]),  jInducedDipolePolar,
+                                                 cSim.pBornRadii[atomI],                       jBornRadius,
                                                 force, torque, dBorn, dBornPolar, &energy
 #ifdef AMOEBA_DEBUG
                                          , pullBack
@@ -350,7 +345,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) atomJ;
        debugArray[index].z                = (float) (mask+1);
-        //debugArray[index].z                = bornRadii[atomI];
+        //debugArray[index].z                = cSim.pBornRadii[atomI];
        //debugArray[index].z                = energy;
        debugArray[index].w                = (float) (blockIdx.x*blockDim.x+threadIdx.x);

@@ -407,54 +402,54 @@ if( mask || !mask ){
            float of;
            unsigned int offset                 = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;

-            of                                  = output_dBornRadius[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
-            output_dBornRadius[offset]          = of;
+            cAmoebaSim.pWorkArray_1_1[offset]   = of;

-            of                                  = output_dBornRadiusPolar[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_2[offset];
            of                                 += dBornPolarSum;
-            output_dBornRadiusPolar[offset]     = of;
+            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            offset                             *= 3;

-            load3dArrayBufferPerWarp( offset, forceSum,  outputForce );
-            load3dArrayBufferPerWarp( offset, torqueSum, outputTorque );
+            load3dArrayBufferPerWarp( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArrayBufferPerWarp( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );

            offset                              = y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;

-            of                                  = output_dBornRadius[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
-            output_dBornRadius[offset]          = of;
+            cAmoebaSim.pWorkArray_1_1[offset]   = of;

-            of                                  = output_dBornRadiusPolar[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_2[offset];
            of                                 += dBornPolarSum;
-            output_dBornRadiusPolar[offset]     = of;
+            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            offset                             *= 3;

-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,  outputForce );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, outputTorque );
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
 #else
            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;

-            output_dBornRadius[offset]          = dBornSum;
-            output_dBornRadiusPolar[offset]     = dBornPolarSum;
+            cAmoebaSim.pWorkArray_1_1[offset]   = dBornSum;
+            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

            offset                             *= 3;

-            load3dArray( offset, forceSum,  outputForce );
-            load3dArray( offset, torqueSum, outputTorque );
+            load3dArray( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArray( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );


            offset                              = y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;

-            output_dBornRadius[offset]          = sA[threadIdx.x].dBornRadius;
-            output_dBornRadiusPolar[offset]     = sA[threadIdx.x].dBornRadiusPolar;
+            cAmoebaSim.pWorkArray_1_1[offset]   = sA[threadIdx.x].dBornRadius;
+            cAmoebaSim.pWorkArray_1_2[offset]   = sA[threadIdx.x].dBornRadiusPolar;

            offset                             *= 3;

-            load3dArray( offset, sA[threadIdx.x].force,  outputForce );
-            load3dArray( offset, sA[threadIdx.x].torque, outputTorque );
+            load3dArray( offset, sA[threadIdx.x].force,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArray( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );

 #endif
            lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
@@ -259,7 +259,7 @@ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int ou

        unsigned int j   = pos/3;
        unsigned int k   = pos - 3*j;
-        fieldOut[4*j+k]  = totalField;
+        fieldOut[4*j+k] += totalField;
        pos             += gridDim.x * blockDim.x;
    }   
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu
@@ -1165,11 +1165,19 @@ void kCalculateAmoebaLocalForces_kernel()

            float dot               = xad*xcd + yad*ycd + zad*zcd;
            float cc                = rad2*rcd2 - dot*dot;
-            float bkk2              = (cc != 0.0f) ? (rdb2 - ee*ee/cc) : 0.0f;
-
-            float cosine            = (rdb2  !=  0.0f) ? sqrtf(bkk2/rdb2) : 0.0f;
-                  cosine            = (cosine >  1.0f) ?  1.0f            : cosine;
-                  cosine            = (cosine < -1.0f) ? -1.0f            : cosine;
+            float bkk2              = (cc   != 0.0f ) ? (ee*ee)/(cc) : 0.0f;
+            float bkk3              = (rdb2 != 0.0f ) ? bkk2/rdb2    : 0.0f;
+                  bkk2              = rdb2 - bkk2;
+            float cosine;
+            if( fabs( bkk3 ) < 0.98f ){
+                  bkk3              = 1.0f - bkk3;
+                  cosine            = bkk3 > 0.0f ? sqrtf(bkk3) : 0.0f;
+                  cosine            = (cosine >  1.0f) ?  0.0f  : acos(cosine);
+            } else {
+                  cosine            = sqrtf(bkk3);
+                  cosine            = asin(cosine);
+            }
+                

 /*
 c
@@ -1204,7 +1212,7 @@ c
 */
            // find the out-of-plane energy and master chain rule terms

-            float    dt             = LOCAL_HACK_RADIAN_D*acos(cosine);
+            float    dt             = LOCAL_HACK_RADIAN_D*cosine;
            float    dt2            = dt  * dt;
            float    dt3            = dt2 * dt;
            float    dt4            = dt2 * dt2;
@@ -1215,7 +1223,7 @@ c
                                                    (cAmoebaSim.amoebaOutOfPlaneBendPenticK* dt3) +
                                                    (cAmoebaSim.amoebaOutOfPlaneBendSexticK* dt4) );

-            float    deddt          = k*dt*LOCAL_HACK_RADIAN*(2.0f                                           + 
+            float    deddt          = k*dt*LOCAL_HACK_RADIAN*(2.0f                                              + 
                                                             (3.0f*cAmoebaSim.amoebaOutOfPlaneBendCubicK*  dt ) +
                                                             (4.0f*cAmoebaSim.amoebaOutOfPlaneBendQuarticK*dt2) +
                                                             (5.0f*cAmoebaSim.amoebaOutOfPlaneBendPenticK* dt3) +
@@ -1281,11 +1289,6 @@ c
            force.x                -= dedxia;
            force.y                -= dedyia;
            force.z                -= dedzia;
-
-force.x = bkk2;
-force.y = rdb2;
-force.z = cosine;
-force.w = dt;
            cSim.pForce4[offset]    = force;

            offset                  = atom1.y + atom2.y * cSim.stride;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu1
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu1
--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -3770,6 +3770,15 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
                    (void) fprintf( log, "CMMotionRemover added w/ frequency=%d at line=%d\n", frequency, lineCount );
                }
   
+            // All forces 
+  
+            } else if( field == ALL_FORCES ){
+                readVec3( filePtr, tokens, forces[ALL_FORCES], &lineCount, field, log );
+            } else if( field == "AllEnergy" ){
+                if( tokens.size() > 1 ){
+                   potentialEnergy[ALL_FORCES] = atof( tokens[1].c_str() ); 
+                }
+   
            // AmoebaHarmonicBond
  
            } else if( field == "AmoebaHarmonicBondParameters" ){
@@ -3876,6 +3885,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
                       field == "AmoebaGk_A_Force"                     ||
                       field == "AmoebaGk_A_DrB"                       ||
                       field == "AmoebaDBorn"                          ||
+                       field == "AmoebaBorn1Force"                     ||
                       field == "AmoebaBornForce"                      ||
                       field == "AmoebaGkEdiffForceAndTorque"          ||
                       field == "AmoebaGkEdiffForce"                      ){
@@ -3885,6 +3895,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
           } else if( field == "AmoebaGkEnergy"       || 
                      field == "AmoebaGkEdiffEnergy"  ||
                      field == "AmoebaGk_A_Energy"    ||
+                      field == "AmoebaBorn1Energy"    ||
                      field == "AmoebaBornEnergy"     ){
               double value = atof( tokens[1].c_str() );
               std::vector< std::vector<double> > vectorOfDoubleVectors;
@@ -3948,14 +3959,19 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
    double totalPotentialEnergy = 0.0;
    if( log )(void) fprintf( log, "Potential energies\n" );
   
+    double allEnergy = 0.0;
    for( MapStringDoubleI ii = potentialEnergy.begin(); ii != potentialEnergy.end(); ii++ ){
-        totalPotentialEnergy += ii->second;
+        if( ii->first == ALL_FORCES ){
+            allEnergy = ii->second;
+        } else {
+            totalPotentialEnergy += ii->second;
+        }
        if( log )(void) fprintf( log, "%30s %14.7e\n", ii->first.c_str(), ii->second );
    }
-    potentialEnergy["AllForces"] = totalPotentialEnergy;
+    potentialEnergy["SumOfInputEnergies"] = totalPotentialEnergy;
       
    if( log ){
-       (void) fprintf( log, "Total PE %14.7e\n", totalPotentialEnergy );
+       (void) fprintf( log, "Total PE %14.7e  %14.7e\n", totalPotentialEnergy, allEnergy );
       (void) fprintf( log, "Read %d lines from file=<%s>\n", lineCount, inputParameterFile.c_str() );
       (void) fflush( log );
    }
@@ -4614,6 +4630,9 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
            activeForceNames += ii->first + ":";
        }
    }
+    if( forceList.size() >= 11 ){
+        activeForceNames =ALL_FORCES;
+    }
  
    std::vector<Vec3> expectedForces;
    expectedForces.resize( system.getNumParticles() );
@@ -4634,7 +4653,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
        }
    }

-    int showAll                            = 0;
+    int showAll                            = 1;
    double energyConversion;
    double forceConversion;
    if( useOpenMMUnits ){
@@ -4650,6 +4669,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
    if( log ){
        std::vector<FILE*> fileList;
        if( log )fileList.push_back( log );
+        double cutoffDelta = 0.02;
        for( unsigned int ii = 0; ii < fileList.size(); ii++ ){
            FILE* filePtr = fileList[ii];
            (void) fprintf( filePtr, "\n" );
@@ -4668,9 +4688,10 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
                double delta           = fabs( normF1 - normF2 );
                double sumNorms        = 0.5*(normF1 + normF2);
                double relativeDelta   = sumNorms > 0.0 ? fabs( normF1 - normF2 )/sumNorms : 0.0;
-                if( ( (maxRelativeDelta < relativeDelta) && (sumNorms > 0.1)) || showAll ){
-                    (void) fprintf( filePtr, "%6u %10.3e %10.3e [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]\n", ii, relativeDelta, delta,
-                                    expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forceConversion*forces[ii][0], forceConversion*forces[ii][1], forceConversion*forces[ii][2] );
+                bool badMatch          = (cutoffDelta < relativeDelta) && (sumNorms > 0.1) ? true : false;
+                if( badMatch || showAll ){
+                    (void) fprintf( filePtr, "%6u %10.3e %10.3e [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]  %s\n", ii, relativeDelta, delta,
+                                    expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forceConversion*forces[ii][0], forceConversion*forces[ii][1], forceConversion*forces[ii][2], ( (showAll && badMatch) ? " XXX" : "") );
                    if( ( (maxRelativeDelta < relativeDelta) && (sumNorms > 0.1)) ){
                        maxRelativeDelta      =  relativeDelta;
                        maxRelativeDeltaIndex = ii;
@@ -5466,7 +5487,7 @@ int runTestsUsingAmoebaTinkerParameterFile( MapStringString& argumentMap ){
            //if( checkEnergyForceConsistency )checkForces = 0;
        } else if( key == "log" ){
            logControl = atoi( value.c_str() );
-        } else if( key == "AllForces" ){
+        } else if( key == ALL_FORCES ){
            initializeForceMap( forceMap, 1 );
        } else if( key == AMOEBA_HARMONIC_BOND_FORCE              ||
                   key == AMOEBA_HARMONIC_ANGLE_FORCE             ||

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.h
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.h
@@ -76,6 +76,7 @@ static std::string AMOEBA_GK_FORCE                                    = "AmoebaG
 static std::string AMOEBA_VDW_FORCE                                   = "AmoebaVdw";
 static std::string AMOEBA_WCA_DISPERSION_FORCE                        = "AmoebaWcaDispersion";
 static std::string AMOEBA_SASA_FORCE                                  = "AmoebaSASA";
+static std::string ALL_FORCES                                         = "AllForces";

 static std::string AMOEBA_MULTIPOLE_ROTATION_MATRICES                 = "AmoebaMultipoleRotationMatrices";
 static std::string AMOEBA_MULTIPOLE_ROTATED                           = "AmoebaMultipolesRotated";