WCA force was overwriting total force vector

Modified out-of-plane bend force to improve stability Redid arguments to kCalculateAmoebaCudaKirkwood to reduce lmem

WCA force was overwriting total force vector
Modified out-of-plane bend force to improve stability Redid arguments to kCalculateAmoebaCudaKirkwood to reduce lmem
7eaa5d29 · Mark Friedrichs · 98b09e6f · 7eaa5d29 · 7eaa5d29 · 7eaa5d29
Commit 7eaa5d29 authored Jul 26, 2010 by Mark Friedrichs
11 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -500,6 +500,7 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    // multipoles

    kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );
+
 //kClearForces(gpu->gpuContext);
 //kClearEnergy(gpu->gpuContext);
 //(void) fprintf( data.getLog(), "computeAmoebaMultipoleForce clearing forces/energy after kCalculateAmoebaMultipoleForces()\n" );

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -1197,7 +1197,10 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
    // output
 
    amoebaGpu->psLabFrameDipole                      = new CUDAStream<float>(3*amoebaGpu->paddedNumberOfAtoms, 1, "LabFrameDipole");
+    amoebaGpu->amoebaSim.pLabFrameDipole             = amoebaGpu->psLabFrameDipole->_pDevStream[0];
+
    amoebaGpu->psLabFrameQuadrupole                  = new CUDAStream<float>(9*amoebaGpu->paddedNumberOfAtoms, 1, "LabFrameQuadrupole");
+    amoebaGpu->amoebaSim.pLabFrameQuadrupole         = amoebaGpu->psLabFrameQuadrupole->_pDevStream[0];

    memset( amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0], 0, sizeof(int)*4*amoebaGpu->paddedNumberOfAtoms );
 }
@@ -1275,7 +1278,11 @@ void gpuMutualInducedFieldAllocate( amoebaGpuContext amoebaGpu )
 #endif

    amoebaGpu->psInducedDipole                 = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipole");
+    amoebaGpu->amoebaSim.pInducedDipole        = amoebaGpu->psInducedDipole->_pDevStream[0];
+
    amoebaGpu->psInducedDipolePolar            = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolar");
+    amoebaGpu->amoebaSim.pInducedDipolePolar   = amoebaGpu->psInducedDipolePolar->_pDevStream[0];
+
    amoebaGpu->psCurrentEpsilon                = new CUDAStream<float>(5, 1, "CurrentEpsilon");
    amoebaGpu->epsilonThreadsPerBlock          = 384;
 
@@ -1326,19 +1333,15 @@ void gpuElectrostaticAllocate( amoebaGpuContext amoebaGpu )
    }
 #endif

-    amoebaGpu->psForce                         = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ForceuElectrostatic");
-    amoebaGpu->psTorque                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ForceTorque");
+    amoebaGpu->psForce                         = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ElectrostaticForce");
+    amoebaGpu->psTorque                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Torque");

    unsigned int offset                        = 3*paddedNumberOfAtoms*sizeof( float );
    memset( amoebaGpu->psForce->_pSysStream[0],          0, offset );
    memset( amoebaGpu->psTorque->_pSysStream[0],         0, offset );

-    offset                                     = paddedNumberOfAtoms*sizeof( float );
-//  memset( amoebaGpu->psEnergy->_pSysStream[0],         0, offset );
-
    amoebaGpu->psForce->Download();
    amoebaGpu->psTorque->Download();
-//    amoebaGpu->psEnergy->Download();
    
 }

@@ -1367,7 +1370,7 @@ void gpuKirkwoodAllocate( amoebaGpuContext amoebaGpu )

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms=%d\n", methodName.c_str(), paddedNumberOfAtoms );
+        (void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms      =%d\n", methodName.c_str(), paddedNumberOfAtoms );
        (void) fflush( amoebaGpu->log );
    }
 #endif
@@ -1375,8 +1378,13 @@ void gpuKirkwoodAllocate( amoebaGpuContext amoebaGpu )
    amoebaGpu->psBorn                            = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBorn");
    amoebaGpu->psBornPolar                       = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBornPolar");
    amoebaGpu->psGk_Field                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Gk_Fixed_Field");
+
    amoebaGpu->psInducedDipoleS                  = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipoleS");
+    amoebaGpu->amoebaSim.pInducedDipoleS         = amoebaGpu->psInducedDipoleS->_pDevStream[0];
+
    amoebaGpu->psInducedDipolePolarS             = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolarS");
+    amoebaGpu->amoebaSim.pInducedDipolePolarS    = amoebaGpu->psInducedDipolePolarS->_pDevStream[0];
+
    amoebaGpu->psKirkwoodForce                   = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodForce");
    amoebaGpu->psKirkwoodEDiffForce              = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodEDiffForce");

@@ -2685,7 +2693,10 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
        (void) fflush( amoebaGpu->log );
    }
    amoebaGpu->psWorkArray_3_1            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_1");
+    amoebaGpu->amoebaSim.pWorkArray_3_1   = amoebaGpu->psWorkArray_3_1->_pDevStream[0];
+
    amoebaGpu->psWorkArray_3_2            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_2");
+    amoebaGpu->amoebaSim.pWorkArray_3_2   = amoebaGpu->psWorkArray_3_2->_pDevStream[0];

    // used GK
    amoebaGpu->psWorkArray_3_3            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_3");
@@ -2694,7 +2705,10 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
    amoebaGpu->psWorkArray_3_6            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_6");

    amoebaGpu->psWorkArray_1_1            = new CUDAStream<float>(  paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_1");
+    amoebaGpu->amoebaSim.pWorkArray_1_1   = amoebaGpu->psWorkArray_1_1->_pDevStream[0];
+
    amoebaGpu->psWorkArray_1_2            = new CUDAStream<float>(  paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_2");
+    amoebaGpu->amoebaSim.pWorkArray_1_2   = amoebaGpu->psWorkArray_1_2->_pDevStream[0];

    amoebaGpu->psEnergy                   = new CUDAStream<float>(amoebaGpu->energyOutputBuffers, 1, "AmoebaEnergy");


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -41,231 +41,6 @@
 #include <builtin_types.h>
 #include <vector_functions.h>

-#if 0
-#define RTERROR(status, s) \
-    if (status != cudaSuccess) { \
-        printf("%s %s\n", s, cudaGetErrorString(status)); \
-        exit(-1); \
-    }
-
-#define LAUNCHERROR(s) \
-    { \
-        cudaError_t status = cudaGetLastError(); \
-        if (status != cudaSuccess) { \
-            printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), s); \
-            exit(-1); \
-        } \
-    }
-#endif
-
-#if 0
-// Pure virtual class to define an interface for objects resident both on GPU and CPU
-struct SoADeviceObject {
-    virtual void Allocate() = 0;
-    virtual void Deallocate() = 0;
-    virtual void Upload() = 0;
-    virtual void Download() = 0;
-};
-
-template <typename T>
-struct CUDAStream : public SoADeviceObject
-{
-    unsigned int    _length;
-    unsigned int    _subStreams;
-    unsigned int    _stride;
-    T**             _pSysStream;
-    T**             _pDevStream;
-    T*              _pSysData;
-    T*              _pDevData;
-    std::string     _name;
-    CUDAStream(int length, int subStreams = 1, std::string name="");
-    CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
-    CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
-    CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
-    virtual ~CUDAStream();
-    void Allocate();
-    void Deallocate();
-    void Upload();
-    void Download();
-    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
-    T& operator[](int index);
-};
-
-float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
-
-template <typename T>
-CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::~CUDAStream()
-{
-    Deallocate();
-}
-
-template <typename T>
-void CUDAStream<T>::Allocate()
-{
-    cudaError_t status;
-    _pSysStream =   new T*[_subStreams];
-    _pDevStream =   new T*[_subStreams];
-    _pSysData =     new T[_subStreams * _stride];
-
-    status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
-    RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
-
-    for (unsigned int i = 0; i < _subStreams; i++)
-    {
-        _pSysStream[i] = _pSysData + i * _stride;
-        _pDevStream[i] = _pDevData + i * _stride;
-    }
-}
-
-template <typename T>
-void CUDAStream<T>::Deallocate()
-{
-    cudaError_t status;
-    delete[] _pSysStream;
-    _pSysStream = NULL;
-    delete[] _pDevStream;
-    _pDevStream = NULL;
-    delete[] _pSysData;
-    _pSysData = NULL;
-    status = cudaFree(_pDevData);
-    RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Upload()
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Download()
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
-{
-    T* pTemp = new T[_subStreams * _stride];
-    unsigned int stream = 0;
-    unsigned int pos = 0;
-    unsigned int newstride = _stride * _subStreams / newstreams;
-    unsigned int newlength = _length * _subStreams / newstreams;
-
-    // Copy data into new format
-    for (unsigned int i = 0; i < _length; i++)
-    {
-        for (unsigned int j = 0; j < _subStreams; j++)
-        {
-            pTemp[stream * newstride + pos] = _pSysStream[j][i];
-            stream++;
-            if (stream == newstreams)
-            {
-                stream = 0;
-                pos++;
-            }
-        }
-    }
-
-    // Remap stream pointers;
-    for (unsigned int i = 0; i < newstreams; i++)
-    {
-        _pSysStream[i] = _pSysData + i * newstride;
-        _pDevStream[i] = _pDevData + i * newstride;
-    }
-
-    // Copy data back intro original stream
-    for (unsigned int i = 0; i < newlength; i++)
-        for (unsigned int j = 0; j < newstreams; j++)
-            _pSysStream[j][i] = pTemp[j * newstride + i];
-    
-    _stride = newstride;
-    _length = newlength;
-    _subStreams = newstreams;
-    delete[] pTemp;
-}
-
-template <typename T>
-T& CUDAStream<T>::operator[](int index)
-{
-    return _pSysData[index];
-}
-
-static const unsigned int GRID = 32;
-static const unsigned int GRIDBITS = 5;
-static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
-static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
-static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
-static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
-static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
-static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
-static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
-static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
-static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
-static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
-static const int G8X_THREADS_PER_BLOCK                  = 256;
-static const int GT2XX_THREADS_PER_BLOCK                = 256;
-static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
-static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
-static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
-static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
-static const unsigned int MAX_STACK_SIZE = 8;
-static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
-
-static const float PI = 3.14159265358979323846f;
-
-static const int PME_ORDER = 4;
-
-enum CudaNonbondedMethod
-{
-    NO_CUTOFF,
-    CUTOFF,
-    PERIODIC,
-    EWALD,
-    PARTICLE_MESH_EWALD
-};
-
-enum ExpressionOp {
-    CONSTANT = 0, VARIABLE0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, GLOBAL, CUSTOM, CUSTOM_DERIV, ADD, SUBTRACT, MULTIPLY, DIVIDE,
-        POWER, NEGATE, SQRT, EXP, LOG, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SQUARE, CUBE, RECIPROCAL, ADD_CONSTANT, MULTIPLY_CONSTANT, POWER_CONSTANT
-};
-
-template<int SIZE>
-struct Expression {
-    int op[SIZE];
-    float arg[SIZE];
-    int length, stackSize;
-};
-#endif
-
 struct cudaAmoebaGmxSimulation {
    // Constants

@@ -344,6 +119,19 @@ struct cudaAmoebaGmxSimulation {
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors

+    float* pLabFrameDipole;
+    float* pLabFrameQuadrupole;
+    float* pInducedDipole;
+    float* pInducedDipolePolar;
+
+    float* pInducedDipoleS;
+    float* pInducedDipolePolarS;
+
+    float* pWorkArray_3_1;
+    float* pWorkArray_3_2;
+    float* pWorkArray_1_1;
+    float* pWorkArray_1_2;
+
    unsigned int amoebaVdwNonReductions;
    int* pAmoebaVdwNonReductionID;


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -867,11 +867,11 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
-    LAUNCHERROR("kReduceForceTorque1");
+    LAUNCHERROR("kReduceElectrostaticForce");
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
-    LAUNCHERROR("kReduceForceTorque2");
+    LAUNCHERROR("kReduceElectrostaticTorque");
 }

 #ifdef AMOEBA_DEBUG
@@ -1137,10 +1137,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector);
-(void) fprintf( amoebaGpu->log, "%s calling cudaWriteVectorOfDoubleVectorsToFile \n", methodName ); fflush( amoebaGpu->log );
            cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
-(void) fprintf( amoebaGpu->log, "%s called cudaWriteVectorOfDoubleVectorsToFile \n", methodName ); fflush( amoebaGpu->log );
-
         }

    }   

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -2386,21 +2386,11 @@ threadsPerBlock = 32;
 #endif

        kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipoleS->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
-                                                                           gpu->psBornRadii->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_1_1->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevStream[0]
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_1_2->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           , debugArray->_pDevStream[0], targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_1_2->_pDevStream[0] );
+                                                                           );
 #endif
    }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
@@ -37,18 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
 */
 void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
-                            unsigned int* workUnit,
-                            float4* atomCoord,
-                            float* labFrameDipole,
-                            float* labFrameQuadrupole,
-                            float* inducedDipole,
-                            float* inducedDipolePolar,
-                            float* bornRadii,
-                            float* outputForce,
-                            float* outputTorque,
-                            float* output_dBornRadius,
-                            float* output_dBornRadiusPolar
-
+                            unsigned int* workUnit
 #ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
 #endif
@@ -67,6 +56,12 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;

+    // pWorkArray_3_1 == force
+    // pWorkArray_3_2 == torque
+
+    // pWorkArray_1_1 == dBorn
+    // pWorkArray_1_2 == dBornPolar
+
    float4 jCoord;
    float jDipole[3];     
    float jQuadrupole[9];     
@@ -94,7 +89,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
        KirkwoodParticle* psA         = &sA[tbx];

        unsigned int atomI            = x + tgx;
-        float4 iCoord                 = atomCoord[atomI];
+        float4 iCoord                 =  cSim.pPosq[atomI];

        float forceSum[3];
        float torqueSum[3];
@@ -120,8 +115,8 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
            // load shared data

            loadKirkwoodShared( &(sA[threadIdx.x]), atomI,
-                                atomCoord, labFrameDipole, labFrameQuadrupole,
-                                inducedDipole, inducedDipolePolar, bornRadii );
+                                cSim.pPosq, cAmoebaSim.pLabFrameDipole, cAmoebaSim.pLabFrameQuadrupole,
+                                cAmoebaSim.pInducedDipoleS,  cAmoebaSim.pInducedDipolePolarS, cSim.pBornRadii );

            // this branch is never exercised since it includes the
            // interaction between atomI and itself which is always excluded
@@ -145,11 +140,11 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(

                calculateKirkwoodPairIxn_kernel( sameAtom,
                                                 iCoord,                                     jCoord,
-                                                 &(labFrameDipole[3*atomI]),                 jDipole,
-                                                 &(labFrameQuadrupole[9*atomI]),             jQuadrupole,
-                                                 &(inducedDipole[3*atomI]),                  jInducedDipole,
-                                                 &(inducedDipolePolar[3*atomI]),             jInducedDipolePolar,
-                                                 bornRadii[atomI],                           jBornRadius,
+                                                 &(cAmoebaSim.pLabFrameDipole[3*atomI]),     jDipole,
+                                                 &(cAmoebaSim.pLabFrameQuadrupole[9*atomI]), jQuadrupole,
+                                                 &(cAmoebaSim.pInducedDipoleS[3*atomI]),     jInducedDipole,
+                                                 &(cAmoebaSim.pInducedDipolePolarS[3*atomI]),jInducedDipolePolar,
+                                                 cSim.pBornRadii[atomI],                     jBornRadius,
                                                 force, torque, dBorn, dBornPolar, &energy
 #ifdef AMOEBA_DEBUG
                                          , pullBack
@@ -188,7 +183,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) atomJ;
        debugArray[index].z                = (float) (mask + 1);
-        //debugArray[index].z                = bornRadii[atomI];
+        //debugArray[index].z                = cSim.pBornRadii[atomI];
        //debugArray[index].z                = energy;
        //debugArray[index].w                = (float) (blockIdx.x*blockDim.x+threadIdx.x);
        debugArray[index].w                = jBornRadius;
@@ -234,30 +229,30 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            float  of;
            unsigned int offset                 = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;

-            of                                  = output_dBornRadius[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
-            output_dBornRadius[offset]          = of;
+            cAmoebaSim.pWorkArray_1_1[offset]   = of;

-            of                                  = output_dBornRadiusPolar[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_2[offset];
            of                                 += dBornPolarSum;
-            output_dBornRadiusPolar[offset]     = of;
+            cAmoebaSim.pWorkArray_1_2[offset]   = of;


            offset                             *= 3;

-            load3dArrayBufferPerWarp( offset, forceSum,  outputForce );
-            load3dArrayBufferPerWarp( offset, torqueSum, outputTorque );
+            load3dArrayBufferPerWarp( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArrayBufferPerWarp( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );

 #else
            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;

-            output_dBornRadius[offset]          = dBornSum;
-            output_dBornRadiusPolar[offset]     = dBornPolarSum;
+            cAmoebaSim.pWorkArray_1_1[offset]   = dBornSum;
+            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

            offset                             *= 3;

-            load3dArray( offset, forceSum,  outputForce );
-            load3dArray( offset, torqueSum, outputTorque );
+            load3dArray( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArray( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );

 #endif

@@ -271,8 +266,8 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
                // load shared data

                loadKirkwoodShared( &(sA[threadIdx.x]), (y+tgx),
-                                    atomCoord, labFrameDipole, labFrameQuadrupole,
-                                    inducedDipole, inducedDipolePolar, bornRadii );
+                                    cSim.pPosq, cAmoebaSim.pLabFrameDipole, cAmoebaSim.pLabFrameQuadrupole,
+                                    cAmoebaSim.pInducedDipoleS, cAmoebaSim.pInducedDipolePolarS, cSim.pBornRadii);

            }

@@ -300,11 +295,11 @@ if( atomI == targetAtom  || atomJ == targetAtom ){

                calculateKirkwoodPairIxn_kernel( sameAtom, 
                                                 iCoord,                                       jCoord,
-                                                 &(labFrameDipole[3*atomI]),                 jDipole,
-                                                 &(labFrameQuadrupole[9*atomI]),             jQuadrupole,
-                                                 &(inducedDipole[3*atomI]),                  jInducedDipole,
-                                                 &(inducedDipolePolar[3*atomI]),             jInducedDipolePolar,
-                                                 bornRadii[atomI],                           jBornRadius,
+                                                 &(cAmoebaSim.pLabFrameDipole[3*atomI]),       jDipole,
+                                                 &(cAmoebaSim.pLabFrameQuadrupole[9*atomI]),   jQuadrupole,
+                                                 &(cAmoebaSim.pInducedDipoleS[3*atomI]),       jInducedDipole,
+                                                 &(cAmoebaSim.pInducedDipolePolarS[3*atomI]),  jInducedDipolePolar,
+                                                 cSim.pBornRadii[atomI],                       jBornRadius,
                                                 force, torque, dBorn, dBornPolar, &energy
 #ifdef AMOEBA_DEBUG
                                          , pullBack
@@ -350,7 +345,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
        debugArray[index].x                = (float) atomI;
        debugArray[index].y                = (float) atomJ;
        debugArray[index].z                = (float) (mask+1);
-        //debugArray[index].z                = bornRadii[atomI];
+        //debugArray[index].z                = cSim.pBornRadii[atomI];
        //debugArray[index].z                = energy;
        debugArray[index].w                = (float) (blockIdx.x*blockDim.x+threadIdx.x);

@@ -407,54 +402,54 @@ if( mask || !mask ){
            float of;
            unsigned int offset                 = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;

-            of                                  = output_dBornRadius[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
-            output_dBornRadius[offset]          = of;
+            cAmoebaSim.pWorkArray_1_1[offset]   = of;

-            of                                  = output_dBornRadiusPolar[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_2[offset];
            of                                 += dBornPolarSum;
-            output_dBornRadiusPolar[offset]     = of;
+            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            offset                             *= 3;

-            load3dArrayBufferPerWarp( offset, forceSum,  outputForce );
-            load3dArrayBufferPerWarp( offset, torqueSum, outputTorque );
+            load3dArrayBufferPerWarp( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArrayBufferPerWarp( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );

            offset                              = y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;

-            of                                  = output_dBornRadius[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_1[offset];
            of                                 += dBornSum;
-            output_dBornRadius[offset]          = of;
+            cAmoebaSim.pWorkArray_1_1[offset]   = of;

-            of                                  = output_dBornRadiusPolar[offset];
+            of                                  = cAmoebaSim.pWorkArray_1_2[offset];
            of                                 += dBornPolarSum;
-            output_dBornRadiusPolar[offset]     = of;
+            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            offset                             *= 3;

-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,  outputForce );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, outputTorque );
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
 #else
            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;

-            output_dBornRadius[offset]          = dBornSum;
-            output_dBornRadiusPolar[offset]     = dBornPolarSum;
+            cAmoebaSim.pWorkArray_1_1[offset]   = dBornSum;
+            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

            offset                             *= 3;

-            load3dArray( offset, forceSum,  outputForce );
-            load3dArray( offset, torqueSum, outputTorque );
+            load3dArray( offset, forceSum,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArray( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );


            offset                              = y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;

-            output_dBornRadius[offset]          = sA[threadIdx.x].dBornRadius;
-            output_dBornRadiusPolar[offset]     = sA[threadIdx.x].dBornRadiusPolar;
+            cAmoebaSim.pWorkArray_1_1[offset]   = sA[threadIdx.x].dBornRadius;
+            cAmoebaSim.pWorkArray_1_2[offset]   = sA[threadIdx.x].dBornRadiusPolar;

            offset                             *= 3;

-            load3dArray( offset, sA[threadIdx.x].force,  outputForce );
-            load3dArray( offset, sA[threadIdx.x].torque, outputTorque );
+            load3dArray( offset, sA[threadIdx.x].force,  cAmoebaSim.pWorkArray_3_1 );
+            load3dArray( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );

 #endif
            lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
@@ -259,7 +259,7 @@ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int ou

        unsigned int j   = pos/3;
        unsigned int k   = pos - 3*j;
-        fieldOut[4*j+k]  = totalField;
+        fieldOut[4*j+k] += totalField;
        pos             += gridDim.x * blockDim.x;
    }   
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu
@@ -1165,11 +1165,19 @@ void kCalculateAmoebaLocalForces_kernel()

            float dot               = xad*xcd + yad*ycd + zad*zcd;
            float cc                = rad2*rcd2 - dot*dot;
-            float bkk2              = (cc != 0.0f) ? (rdb2 - ee*ee/cc) : 0.0f;
+            float bkk2              = (cc   != 0.0f ) ? (ee*ee)/(cc) : 0.0f;
+            float bkk3              = (rdb2 != 0.0f ) ? bkk2/rdb2    : 0.0f;
+                  bkk2              = rdb2 - bkk2;
+            float cosine;
+            if( fabs( bkk3 ) < 0.98f ){
+                  bkk3              = 1.0f - bkk3;
+                  cosine            = bkk3 > 0.0f ? sqrtf(bkk3) : 0.0f;
+                  cosine            = (cosine >  1.0f) ?  0.0f  : acos(cosine);
+            } else {
+                  cosine            = sqrtf(bkk3);
+                  cosine            = asin(cosine);
+            }
                
-            float cosine            = (rdb2  !=  0.0f) ? sqrtf(bkk2/rdb2) : 0.0f;
-                  cosine            = (cosine >  1.0f) ?  1.0f            : cosine;
-                  cosine            = (cosine < -1.0f) ? -1.0f            : cosine;

 /*
 c
@@ -1204,7 +1212,7 @@ c
 */
            // find the out-of-plane energy and master chain rule terms

-            float    dt             = LOCAL_HACK_RADIAN_D*acos(cosine);
+            float    dt             = LOCAL_HACK_RADIAN_D*cosine;
            float    dt2            = dt  * dt;
            float    dt3            = dt2 * dt;
            float    dt4            = dt2 * dt2;
@@ -1281,11 +1289,6 @@ c
            force.x                -= dedxia;
            force.y                -= dedyia;
            force.z                -= dedzia;
-
-force.x = bkk2;
-force.y = rdb2;
-force.z = cosine;
-force.w = dt;
            cSim.pForce4[offset]    = force;

            offset                  = atom1.y + atom2.y * cSim.stride;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu1
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaLocalForces.cu1
-/* -------------------------------------------------------------------------- *
- *                                  AmoebaOpenMM                              *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "amoebaGpuTypes.h"
-
-extern __shared__ Vectors sV[];
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
-
-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI        3.1415926535897932384626433832795f
-#define LOCAL_HACK_RADIAN   57.29577951308232088f
-#define LOCAL_HACK_RADIAN_D 57.29577951308232088
-
-#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
-
-#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
-{ \
-    dp          = DOT3(v1, v2); \
-    float norm1 = DOT3(v1, v1); \
-    float norm2 = DOT3(v2, v2); \
-    dp /= sqrt(norm1 * norm2); \
-    dp = min(dp, 1.0f); \
-    dp = max(dp, -1.0f); \
-}
-
-#define CROSS_PRODUCT(v1, v2, c) \
-    c.x = v1.y * v2.z - v1.z * v2.y; \
-    c.y = v1.z * v2.x - v1.x * v2.z; \
-    c.z = v1.x * v2.y - v1.y * v2.x;
-
-
-#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
-{ \
-    float dp; \
-    GETNORMEDDOTPRODUCT(v1, v2, dp); \
-    angle = acos(dp); \
-}
-
-#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
-{ \
-    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
-    angle = acos(cosine); \
-}
-
-#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}                                                          
-
-#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}
-
-void SetCalculateAmoebaLocalForcesSim(amoebaGpuContext amoebaGpu)
-{
-    cudaError_t status;
-    gpuContext gpu = amoebaGpu->gpuContext;
-    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateAmoebaLocalForcesSim copy to cSim failed");
-    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-
-void GetCalculateAmoebaLocalForcesSim(amoebaGpuContext amoebaGpu)
-{
-    cudaError_t status;
-    gpuContext gpu = amoebaGpu->gpuContext;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: GetCalculateAmoebaLocalForcesSim copy from cSim failed");
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: GetCalculateAmoebaLocalForcesSim copy from cAmoebaSim failed");
-}
-
-// bicubic spline
-
-__device__ void bicubic( float4 y, float4 y1i, float4 y2i, float4 y12i, float x1, float x1l, float x1u,
-                         float x2, float x2l, float x2u, float* energyOut, float* dang1Out, float* dang2Out )
-//                         float4* c0, float4* c1, float4* c2, float4* c3 )
-{
-
-   // c[0][j] = cl[0-3]
-   // c[1][j] = cl[4-7]
-   // c[2][j] = cl[8-11]
-   // c[4][j] = cl[12-15]
-
-   float c[4][4];
-   float d1   = x1u - x1l;
-   float d2   = x2u - x2l;
-   float d12  = d1*d2;
-
-   float4 y1;
-   y1.x       = d1*y1i.x;
-   y1.y       = d1*y1i.y;
-   y1.z       = d1*y1i.z;
-   y1.w       = d1*y1i.w;
-
-   float4 y2;
-   y2.x       = d2*y2i.x;
-   y2.y       = d2*y2i.y;
-   y2.z       = d2*y2i.z;
-   y2.w       = d2*y2i.w;
-
-   float4 y12;
-   y12.x      = d12*y12i.x;
-   y12.y      = d12*y12i.y;
-   y12.z      = d12*y12i.z;
-   y12.w      = d12*y12i.w;
-
-   // 1    1    1.000 [0][0]
-   c[0][0]    = y.x;
-
-   // 2    9    1.000 [0][1]
-   c[0][1]    = y2.x;
-
-   // 3    1   -3.000 [0][2]
-   // 3    4    3.000
-   // 3    9   -2.000
-   // 3   12   -1.000
-
-   c[0][2]    =  3.0f*(y.w - y.x) - (2.0f*y2.x + y2.w);
-   
-   // 4    1    2.000 [0][3]
-   // 4    4   -2.000
-   // 4    9    1.000
-   // 4   12    1.000
- 
-   c[0][3]    =  2.0f*(y.x - y.w) + y2.x + y2.w;
-
-   // 5    5    1.000 [1][0]
-   c[1][0]    =  y1.x;
-
-   // 6   13    1.000 [1][1]
-   c[1][1]    =  y12.x;
-
-   // 7    5   -3.000 [1][2]
-   // 7    8    3.000
-   // 7   13   -2.000
-   // 7   16   -1.000
-
-   c[1][2]    =  3.0f*(y1.w - y1.x) - (2.0f*y12.x + y12.w);
-
-   // 8    5    2.000 [1][3]
-   // 8    8   -2.000
-   // 8   13    1.000
-   // 8   16    1.000
-
-   c[1][3]    =  2.0f*(y1.x - y1.w) + y12.x + y12.w;
-
-   // 9    1   -3.000 [2][0]
-   // 9    2    3.000
-   // 9    5   -2.000
-   // 9    6   -1.000
-
-   c[2][0]    =  3.0f*(y.y - y.x) - (2.0f*y1.x + y1.y);
-
-   // 10    9   -3.000 [2][1]
-   // 10   10    3.000
-   // 10   13   -2.000
-   // 10   14   -1.000
-   c[2][1]    =  3.0f*(y2.y - y2.x) - (2.0f*y12.x + y12.y);
-
-   // 11    1    9.000 [2][2]
-   // 11    2   -9.000
-   // 11    3    9.000
-   // 11    4   -9.000
-
-   // 11    5    6.000
-   // 11    6    3.000
-   // 11    7   -3.000
-   // 11    8   -6.000
-
-   // 11    9    6.000
-   // 11   10   -6.000
-   // 11   11   -3.000
-   // 11   12    3.000
-
-   // 11   13    4.000
-   // 11   14    2.000
-   // 11   15    1.000
-   // 11   16    2.000
-   c[2][2]    =  9.0f*(y.x - y.y + y.z - y.w) + 6.0f* y1.x  +  3.0f* y1.y  -  3.0f* y1.z  -  6.0f* y1.w +
-                                                6.0f* y2.x  -  6.0f* y2.y  -  3.0f* y2.z  +  3.0f* y2.w +
-                                                4.0f*y12.x  +  2.0f*y12.y  +       y12.z  +  2.0f*y12.w;
-
-   // 12    1   -6.000 [2][3]
-   // 12    2    6.000
-   // 12    3   -6.000
-   // 12    4    6.000
-
-   // 12    5   -4.000
-   // 12    6   -2.000
-   // 12    7    2.000
-   // 12    8    4.000
-
-   // 12    9   -3.000
-   // 12   10    3.000
-   // 12   11    3.000
-   // 12   12   -3.000
-
-   // 12   13   -2.000
-   // 12   14   -1.000
-   // 12   15   -1.000
-   // 12   16   -2.000
-
-   c[2][3]    =  6.0f*(y.y - y.x + y.w - y.z) + -4.0f* y1.x  -  2.0f* y1.y  +  2.0f* y1.z  +  4.0f* y1.w +
-                                                -3.0f* y2.x  +  3.0f* y2.y  +  3.0f* y2.z  -  3.0f* y2.w
-                                                -2.0f*y12.x  -       y12.y  -       y12.z  -  2.0f*y12.w;
-
-   // 13    1    2.000 [3][0]
-   // 13    2   -2.000
-   // 13    5    1.000
-   // 13    6    1.000
-   c[3][0]    =  2.0f*(y.x - y.y) + y1.x + y1.y;
-
-   // 14    9    2.000 [3][1]
-   // 14   10   -2.000
-   // 14   13    1.000
-   // 14   14    1.000
-   c[3][1]    =  2.0f*(y2.x - y2.y) + y12.x + y12.y;
-
-   // 15    1   -6.000 [3][2]
-   // 15    2    6.000
-   // 15    3   -6.000
-   // 15    4    6.000
-
-   // 15    5   -3.000
-   // 15    6   -3.000
-   // 15    7    3.000
-   // 15    8    3.000
-
-   // 15    9   -4.000
-   // 15   10    4.000
-   // 15   11    2.000
-   // 15   12   -2.000
-
-   // 15   13   -2.000
-   // 15   14   -2.000
-   // 15   15   -1.000
-   // 15   16   -1.000
-
-   c[3][2]    =  6.0f*( y.y -  y.x +  y.w -  y.z) +
-                 3.0f*(y1.z + y1.w - y1.x - y1.y) +
-                 2.0f*( 2.0f*(y2.y - y2.x) + y2.z - y2.w) +
-                 -2.0f*(y12.x + y12.y) - y12.z - y12.w;
-
-   // 16    1    4.000 [3][3]
-   // 16    2   -4.000
-   // 16    3    4.000
-   // 16    4   -4.000
-
-   // 16    5    2.000
-   // 16    6    2.000
-   // 16    7   -2.000
-   // 16    8   -2.000
-
-   // 16    9    2.000
-   // 16   10   -2.000
-   // 16   11   -2.000
-   // 16   12    2.000
-
-   // 16   13    1.000
-   // 16   14    1.000
-   // 16   15    1.000
-   // 16   16    1.000
-
-   c[3][3]    =  4.0f*(  y.x -    y.y  +   y.z -  y.w)  +
-                 2.0f*( y1.x +   y1.y  -  y1.z -  y1.w) +
-                 2.0f*( y2.x -   y2.y  -  y2.z +  y2.w) +
-                       y12.x +  y12.y  + y12.z + y12.w;
-
-   float      t = (x1-x1l) / (x1u-x1l);
-   float      u = (x2-x2l) / (x2u-x2l);
-
-   float energy =            ((c[3][3]*u + c[3][2])*u + c[3][1])*u + c[3][0];
-         energy = t*energy + ((c[2][3]*u + c[2][2])*u + c[2][1])*u + c[2][0];
-         energy = t*energy + ((c[1][3]*u + c[1][2])*u + c[1][1])*u + c[1][0];
-         energy = t*energy + ((c[0][3]*u + c[0][2])*u + c[0][1])*u + c[0][0];
-
-   float dang1  =           (3.0f*c[3][3]*t + 2.0f*c[2][3])*t + c[1][3];
-         dang1  = u*dang1 + (3.0f*c[3][2]*t + 2.0f*c[2][2])*t + c[1][2];
-         dang1  = u*dang1 + (3.0f*c[3][1]*t + 2.0f*c[2][1])*t + c[1][1];
-         dang1  = u*dang1 + (3.0f*c[3][0]*t + 2.0f*c[2][0])*t + c[1][0];
-
-   float dang2  =           (3.0f*c[3][3]*u + 2.0f*c[3][2])*u + c[3][1];
-         dang2  = t*dang2 + (3.0f*c[2][3]*u + 2.0f*c[2][2])*u + c[2][1];
-         dang2  = t*dang2 + (3.0f*c[1][3]*u + 2.0f*c[1][2])*u + c[1][1];
-         dang2  = t*dang2 + (3.0f*c[0][3]*u + 2.0f*c[0][2])*u + c[0][1];
-
-      dang1     = dang1 / (x1u-x1l);
-      dang2     = dang2 / (x2u-x2l);
-
-   *energyOut   = energy;
-   *dang1Out    = dang1;
-   *dang2Out    = dang2;
-
-}
-    
-__global__ void kCalculateAmoebaLocalForces_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    Vectors* A = &sV[threadIdx.x];
-
-    float energy = 0.0f;
-
-    while (pos < cAmoebaSim.amoebaBond_offset)
-    {
-        if (pos < cAmoebaSim.amoebaBonds)
-        {
-            int4   atom         = cAmoebaSim.pAmoebaBondID[pos];
-            float4 atomA        = cSim.pPosq[atom.x];
-            float4 atomB        = cSim.pPosq[atom.y];
-            float4 bond         = cAmoebaSim.pAmoebaBondParameter[pos];
-            float dx            = atomB.x - atomA.x;
-            float dy            = atomB.y - atomA.y;
-            float dz            = atomB.z - atomA.z;
-            float r2            = dx * dx + dy * dy + dz * dz;
-            float r             = sqrt(r2);
-            float deltaIdeal    = r - bond.x;
-#if defined OLD
-            energy             += 0.5f * bond.y * deltaIdeal * deltaIdeal;
-            float dEdR          = bond.y * deltaIdeal;
-            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
-#else
-            float deltaIdeal2   = deltaIdeal*deltaIdeal;
-            energy             += bond.y * deltaIdeal2*( 1.0f + bond.z*deltaIdeal + bond.w*deltaIdeal2 );
-            float dEdR          = 2.0f*bond.y * deltaIdeal*( 1.0f + 1.5f*bond.z*deltaIdeal + 2.0f*bond.w*deltaIdeal2 );
-            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
-#endif
-
-            dx                 *= dEdR;
-            dy                 *= dEdR;
-            dz                 *= dEdR;
-            unsigned int offsetA                = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB                = atom.y + atom.w * cSim.stride;
-            float4 forceA                       = cSim.pForce4[offsetA];
-            float4 forceB                       = cSim.pForce4[offsetB];
-            forceA.x                           += dx;
-            forceA.y                           += dy;
-            forceA.z                           += dz;
-            forceB.x                           -= dx;
-            forceB.y                           -= dy;
-            forceB.z                           -= dz;
-            cSim.pForce4[offsetA]               = forceA;
-            cSim.pForce4[offsetB]               = forceB;    
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaAngle_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaBond_offset;
-        if (pos1 < cAmoebaSim.amoebaAngles )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaAngleID1[pos1];  
-
-            /*
-               bond_angle1.x        ideal
-               bond_angle1.y        k
-            */
-
-            float2 bond_angle1      = cAmoebaSim.pAmoebaAngleParameter[pos1];
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            A->v0.x                 = a2.x - a1.x;
-            A->v0.y                 = a2.y - a1.y;
-            A->v0.z                 = a2.z - a1.z;
-            A->v1.x                 = a2.x - a3.x;
-            A->v1.y                 = a2.y - a3.y;
-            A->v1.z                 = a2.z - a3.z;
-            float3 cp;
-            CROSS_PRODUCT(A->v0, A->v1, cp);
-            float rp                = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
-            rp                      = max(sqrt(rp), 1.0e-06f);
-            float r21               = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
-            float r23               = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
-            float dot               = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
-            float cosine            = dot / sqrt(r21 * r23);
-
-// e     = angunit * force * dt2 * (1.0d0+cang*dt+qang*dt2+pang*dt3+sang*dt4)
-// deddt = angunit * force * dt * radian * (2.0d0 + 3.0d0*cang*dt + 4.0d0*qang*dt2 + 5.0d0*pang*dt3 + 6.0d0*sang*dt4)
-
-            float angle             = acos(cosine);
-            float deltaIdeal        = angle*(180.0f/LOCAL_HACK_PI) - bond_angle1.x;
-            float deltaIdeal2       = deltaIdeal *deltaIdeal;
-            float deltaIdeal3       = deltaIdeal *deltaIdeal2;
-            float deltaIdeal4       = deltaIdeal2*deltaIdeal2;
-            energy                 += bond_angle1.y*deltaIdeal2*( 1.0f + cAmoebaSim.amoebaAngleCubicK*deltaIdeal               +
-                                                                         cAmoebaSim.amoebaAngleQuarticK*deltaIdeal2            +
-                                                                         cAmoebaSim.amoebaAnglePenticK*deltaIdeal3             +
-                                                                         cAmoebaSim.amoebaAngleSexticK*deltaIdeal4 );
-
-            float dEdR              = bond_angle1.y*deltaIdeal*( 2.0f + 3.0f*cAmoebaSim.amoebaAngleCubicK*deltaIdeal           +
-                                                                        4.0f*cAmoebaSim.amoebaAngleQuarticK*deltaIdeal2        +
-                                                                        5.0f*cAmoebaSim.amoebaAnglePenticK*deltaIdeal3         + 
-                                                                        6.0f*cAmoebaSim.amoebaAngleSexticK*deltaIdeal4 );
-
-            dEdR                   *= LOCAL_HACK_RADIAN;
-
-            float termA             =  dEdR / (r21 * rp);
-            float termC             = -dEdR / (r23 * rp);
-            float3 c21;
-            float3 c23;
-            CROSS_PRODUCT(A->v0, cp, c21);
-            CROSS_PRODUCT(A->v1, cp, c23);
-            c21.x                  *= termA;
-            c21.y                  *= termA;
-            c21.z                  *= termA;
-            c23.x                  *= termC;
-            c23.y                  *= termC;
-            c23.z                  *= termC;
-            int2 atom2              = cAmoebaSim.pAmoebaAngleID2[pos1];
-            unsigned int offset     = atom1.x + atom1.w * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                += c21.x;
-            force.y                += c21.y;
-            force.z                += c21.z;
-            cSim.pForce4[offset]    = force;
-            offset                  = atom1.y + atom2.x * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= (c21.x + c23.x);
-            force.y                -= (c21.y + c23.y);
-            force.z                -= (c21.z + c23.z);
-            cSim.pForce4[offset]    = force;
-            offset                  = atom1.z + atom2.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                += c23.x;
-            force.y                += c23.y;
-            force.z                += c23.z;
-            cSim.pForce4[offset]    = force;
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaInPlaneAngle_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaAngle_offset;
-        if (pos1 < cAmoebaSim.amoebaInPlaneAngles )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaInPlaneAngleID1[pos1];  
-
-            /*
-               bond_angle1.x        ideal
-               bond_angle1.y        k
-            */
-
-            float2 bond_angle1      = cAmoebaSim.pAmoebaInPlaneAngleParameter[pos1];
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            float4 a4               = cSim.pPosq[atom1.w];
-
-            float xad               = a1.x - a4.x;
-            float yad               = a1.y - a4.y;
-            float zad               = a1.z - a4.z;
-
-            float xbd               = a2.x - a4.x;
-            float ybd               = a2.y - a4.y;
-            float zbd               = a2.z - a4.z;
-
-            float xcd               = a3.x - a4.x;
-            float ycd               = a3.y - a4.y;
-            float zcd               = a3.z - a4.z;
-
-            float xt                = yad*zcd - zad*ycd;
-            float yt                = zad*xcd - xad*zcd;
-            float zt                = xad*ycd - yad*xcd;
-
-            float rt2               = xt*xt + yt*yt + zt*zt;
-
-            float delta             = -(xt*xbd + yt*ybd + zt*zbd) / rt2;
-
-            float xip               = a2.x + xt*delta;
-            float yip               = a2.y + yt*delta;
-            float zip               = a2.z + zt*delta;
-
-            float xap               = a1.x - xip;
-            float yap               = a1.y - yip;
-            float zap               = a1.z - zip;
-
-            float xcp               = a3.x - xip;
-            float ycp               = a3.y - yip;
-            float zcp               = a3.z - zip;
-
-            float rap2              = xap*xap + yap*yap + zap*zap;
-            float rcp2              = xcp*xcp + ycp*ycp + zcp*zcp;
-
-            float xm                = ycp*zap - zcp*yap;
-            float ym                = zcp*xap - xcp*zap;
-            float zm                = xcp*yap - ycp*xap;
-
-            float rm                = sqrtf(xm*xm + ym*ym + zm*zm);
-                  rm                = rm > 0.000001f ? rm : 0.000001f;
-            float dot               = xap*xcp + yap*ycp + zap*zcp;
-            float product           = sqrtf(rap2*rcp2);
-            float cosine            = product > 0.0f ? (dot/product) : 0.0f;
-                  cosine            = cosine >  1.0f ?  1.0f : cosine;
-                  cosine            = cosine < -1.0f ? -1.0f : cosine;
-            float angle             = acos(cosine);
-
-            // if product == 0, set force/energy to 0
-
-            float deltaIdeal        = product > 0.0f ? (angle*(180.0f/LOCAL_HACK_PI) - bond_angle1.x) : 0.0f;
-            float deltaIdeal2       = deltaIdeal *deltaIdeal;
-            float deltaIdeal3       = deltaIdeal *deltaIdeal2;
-            float deltaIdeal4       = deltaIdeal2*deltaIdeal2;
-
-            energy                 += bond_angle1.y*deltaIdeal2*( 1.0f +  cAmoebaSim.amoebaInPlaneAngleCubicK*deltaIdeal          +
-                                                                          cAmoebaSim.amoebaInPlaneAngleQuarticK*deltaIdeal2       +
-                                                                          cAmoebaSim.amoebaInPlaneAnglePenticK*deltaIdeal3        +
-                                                                          cAmoebaSim.amoebaInPlaneAngleSexticK*deltaIdeal4 );
-
-
-            float dEdR              = bond_angle1.y*deltaIdeal*( 2.0f + 3.0f*cAmoebaSim.amoebaInPlaneAngleCubicK*deltaIdeal       + 
-                                                                        4.0f*cAmoebaSim.amoebaInPlaneAngleQuarticK*deltaIdeal2    +
-                                                                        5.0f*cAmoebaSim.amoebaInPlaneAnglePenticK*deltaIdeal3     + 
-                                                                        6.0f*cAmoebaSim.amoebaInPlaneAngleSexticK*deltaIdeal4 );
-
-            dEdR                   *= LOCAL_HACK_RADIAN;
-
-            float terma             = -dEdR/ (rap2*rm);
-            float termc             = dEdR / (rcp2*rm);
-
-            float dedxia            = terma * (yap*zm-zap*ym);
-            float dedyia            = terma * (zap*xm-xap*zm);
-            float dedzia            = terma * (xap*ym-yap*xm);
-
-            float dedxic            = termc * (ycp*zm-zcp*ym);
-            float dedyic            = termc * (zcp*xm-xcp*zm);
-            float dedzic            = termc * (xcp*ym-ycp*xm);
-
-            float dedxip            = -dedxia - dedxic;
-            float dedyip            = -dedyia - dedyic;
-            float dedzip            = -dedzia - dedzic;
-
-            float delta2            = 2.0f * delta;
-            float ptrt2             = (dedxip*xt + dedyip*yt + dedzip*zt) / rt2;
-
-            float term              = (zcd*ybd-ycd*zbd) + delta2*(yt*zcd-zt*ycd);
-            float dpdxia            = delta*(ycd*dedzip-zcd*dedyip) + term*ptrt2;
-
-                  term              = (xcd*zbd-zcd*xbd) + delta2*(zt*xcd-xt*zcd);
-            float dpdyia            = delta*(zcd*dedxip-xcd*dedzip) + term*ptrt2;
-
-                  term              = (ycd*xbd-xcd*ybd) + delta2*(xt*ycd-yt*xcd);
-            float dpdzia            = delta*(xcd*dedyip-ycd*dedxip) + term*ptrt2;
-
-                  term              = (yad*zbd-zad*ybd) + delta2*(zt*yad-yt*zad);
-            float dpdxic            = delta*(zad*dedyip-yad*dedzip) + term*ptrt2;
-
-                  term              = (zad*xbd-xad*zbd) + delta2*(xt*zad-zt*xad);
-            float dpdyic            = delta*(xad*dedzip-zad*dedxip) + term*ptrt2;
-
-                  term              = (xad*ybd-yad*xbd) + delta2*(yt*xad-xt*yad);
-            float dpdzic            = delta*(yad*dedxip-xad*dedyip) + term*ptrt2;
-
-                  dedxia            = dedxia + dpdxia;
-                  dedyia            = dedyia + dpdyia;
-                  dedzia            = dedzia + dpdzia;
-
-            float dedxib            = dedxip;
-            float dedyib            = dedyip;
-            float dedzib            = dedzip;
-
-                  dedxic            = dedxic + dpdxic;
-                  dedyic            = dedyic + dpdyic;
-                  dedzic            = dedzic + dpdzic;
-
-            float dedxid            = -dedxia - dedxib - dedxic;
-            float dedyid            = -dedyia - dedyib - dedyic;
-            float dedzid            = -dedzia - dedzib - dedzic;
-
-            int4 atom2              = cAmoebaSim.pAmoebaInPlaneAngleID2[pos1];
-            unsigned int offset     = atom1.x + atom2.x * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                -= dedxia;
-            force.y                -= dedyia;
-            force.z                -= dedzia;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.y + atom2.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxib;
-            force.y                -= dedyib;
-            force.z                -= dedzib;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.z + atom2.z * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxic;
-            force.y                -= dedyic;
-            force.z                -= dedzic;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.w + atom2.w * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxid;
-            force.y                -= dedyid;
-            force.z                -= dedzid;
-            cSim.pForce4[offset]    = force;
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaTorsion_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaInPlaneAngle_offset;
-        if (pos1 < cAmoebaSim.amoebaTorsions )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaTorsionID1[pos1];  
-
-            /*
-               torsionParam1.x      amplitude(1)
-               torsionParam1.y      phase(1)
-               torsionParam1.z      amplitude(2)
-               torsionParam1.w      phase(2)
-               torsionParam2.x      amplitude(3)
-               torsionParam2.y      phase(3)
-            */
-
-            float4 torsionParam1    = cAmoebaSim.pAmoebaTorsionParameter1[pos1];
-            float2 torsionParam2    = cAmoebaSim.pAmoebaTorsionParameter2[pos1];
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            float4 a4               = cSim.pPosq[atom1.w];
-
-            float xba               = a2.x - a1.x;
-            float yba               = a2.y - a1.y;
-            float zba               = a2.z - a1.z;
-
-            float xcb               = a3.x - a2.x;
-            float ycb               = a3.y - a2.y;
-            float zcb               = a3.z - a2.z;
-
-            float xdc               = a4.x - a3.x;
-            float ydc               = a4.y - a3.y;
-            float zdc               = a4.z - a3.z;
-
-            float xt                = yba*zcb - ycb*zba;
-            float yt                = zba*xcb - zcb*xba;
-            float zt                = xba*ycb - xcb*yba;
-            float xu                = ycb*zdc - ydc*zcb;
-            float yu                = zcb*xdc - zdc*xcb;
-            float zu                = xcb*ydc - xdc*ycb;
-
-            float xtu               = yt*zu - yu*zt;
-            float ytu               = zt*xu - zu*xt;
-            float ztu               = xt*yu - xu*yt;
-
-            float rt2               = xt*xt + yt*yt + zt*zt;
-            float ru2               = xu*xu + yu*yu + zu*zu;
-
-            float rtru              = sqrtf(rt2 * ru2);
-
-            float rcb               = sqrtf(xcb*xcb + ycb*ycb + zcb*zcb);
-            float cosine            = rtru > 0.0f ? ( (xt*xu + yt*yu + zt*zu) / rtru) : 0.0f;
-            float sine              = rtru > 0.0f ? ( (xcb*xtu + ycb*ytu + zcb*ztu) / (rcb*rtru) ) : 0.0f;
-
-            // set the torsional parameters for this angle;
-
-            float    v1             = torsionParam1.x;
-            float    angle          = torsionParam1.y;
-            float    c1             = cos( angle );
-            float    s1             = sin( angle );
-
-            float    v2             = torsionParam1.z;
-                     angle          = torsionParam1.w;
-            float    c2             = cos( angle );
-            float    s2             = sin( angle );
-
-
-            float    v3             = torsionParam2.x;
-                     angle          = torsionParam2.y;
-            float    c3             = cos( angle );
-            float    s3             = sin( angle );
-
-            // compute the multiple angle trigonometry and the phase terms
-
-            float    cosine2          = cosine*cosine - sine*sine;
-            float    sine2            = 2.0f * cosine * sine;
-            float    cosine3          = cosine*cosine2 - sine*sine2;
-            float    sine3            = cosine*sine2 + sine*cosine2;
-
-            // not deleted since may be needed in future
-/*
-            float    cosine4          = cosine*cosine3 - sine*sine3;
-            float    sine4            = cosine*sine3 + sine*cosine3;
-            float    cosine5          = cosine*cosine4 - sine*sine4;
-            float    sine5            = cosine*sine4 + sine*cosine4;
-            float    cosine6          = cosine*cosine5 - sine*sine5;
-            float    sine6            = cosine*sine5 + sine*cosine5;
-*/
-            float    phi1             = 1.0f + (cosine*c1 + sine*s1);
-            float    phi2             = 1.0f + (cosine2*c2 + sine2*s2);
-            float    phi3             = 1.0f + (cosine3*c3 + sine3*s3);
-/*
-            float    phi4             = 1.0f + (cosine4*c4 + sine4*s4);
-            float    phi5             = 1.0f + (cosine5*c5 + sine5*s5);
-            float    phi6             = 1.0f + (cosine6*c6 + sine6*s6);
-*/
-            float    dphi1            = (cosine*s1 - sine*c1);
-            float    dphi2            = 2.0f * (cosine2*s2 - sine2*c2);
-            float    dphi3            = 3.0f * (cosine3*s3 - sine3*c3);
-/*
-            float    dphi4            = 4.0f * (cosine4*s4 - sine4*c4);
-            float    dphi5            = 5.0f * (cosine5*s5 - sine5*c5);
-            float    dphi6            = 6.0f * (cosine6*s6 - sine6*c6);
-*/
-
-            // calculate torsional energy and master chain rule term
-
-            energy                   += v1*phi1 + v2*phi2 + v3*phi3;
-     //                                 + v4*phi4 + v5*phi5 + v6*phi6;
-
-            float    dedphi           = v1*dphi1 + v2*dphi2 + v3*dphi3;
-     //                                 + v4*dphi4 + v5*dphi5 + v6*dphi6;
-
-            // chain rule terms for first derivative components
-
-            float    xca              = a3.x - a1.x;
-            float    yca              = a3.y - a1.y;
-            float    zca              = a3.z - a1.z;
-
-            float    xdb              = a4.x - a2.x;
-            float    ydb              = a4.y - a2.y;
-            float    zdb              = a4.z - a2.z;
-
-            float    dedxt            =  dedphi * (yt*zcb - ycb*zt) / (rt2*rcb);
-            float    dedyt            =  dedphi * (zt*xcb - zcb*xt) / (rt2*rcb);
-            float    dedzt            =  dedphi * (xt*ycb - xcb*yt) / (rt2*rcb);
-            float    dedxu            = -dedphi * (yu*zcb - ycb*zu) / (ru2*rcb);
-            float    dedyu            = -dedphi * (zu*xcb - zcb*xu) / (ru2*rcb);
-            float    dedzu            = -dedphi * (xu*ycb - xcb*yu) / (ru2*rcb);
-
-            // compute first derivative components for this angle
-
-            float    dedxia           = zcb*dedyt - ycb*dedzt;
-            float    dedyia           = xcb*dedzt - zcb*dedxt;
-            float    dedzia           = ycb*dedxt - xcb*dedyt;
-            float    dedxib           = yca*dedzt - zca*dedyt + zdc*dedyu - ydc*dedzu;
-            float    dedyib           = zca*dedxt - xca*dedzt + xdc*dedzu - zdc*dedxu;
-            float    dedzib           = xca*dedyt - yca*dedxt + ydc*dedxu - xdc*dedyu;
-            float    dedxic           = zba*dedyt - yba*dedzt + ydb*dedzu - zdb*dedyu;
-            float    dedyic           = xba*dedzt - zba*dedxt + zdb*dedxu - xdb*dedzu;
-            float    dedzic           = yba*dedxt - xba*dedyt + xdb*dedyu - ydb*dedxu;
-            float    dedxid           = zcb*dedyu - ycb*dedzu;
-            float    dedyid           = xcb*dedzu - zcb*dedxu;
-            float    dedzid           = ycb*dedxu - xcb*dedyu;
-
-            int4 atom2                = cAmoebaSim.pAmoebaTorsionID2[pos1];
-            unsigned int offset       = atom1.x + atom2.x * cSim.stride;
-            float4 force              = cSim.pForce4[offset]; 
-            force.x                  -= dedxia;
-            force.y                  -= dedyia;
-            force.z                  -= dedzia;
-            cSim.pForce4[offset]      = force;
-
-            offset                    = atom1.y + atom2.y * cSim.stride;
-            force                     = cSim.pForce4[offset];
-            force.x                  -= dedxib;
-            force.y                  -= dedyib;
-            force.z                  -= dedzib;
-            cSim.pForce4[offset]      = force;
-
-            offset                    = atom1.z + atom2.z * cSim.stride;
-            force                     = cSim.pForce4[offset];
-            force.x                  -= dedxic;
-            force.y                  -= dedyic;
-            force.z                  -= dedzic;
-            cSim.pForce4[offset]      = force;
-
-            offset                    = atom1.w + atom2.w * cSim.stride;
-            force                     = cSim.pForce4[offset];
-            force.x                  -= dedxid;
-            force.y                  -= dedyid;
-            force.z                  -= dedzid;
-            cSim.pForce4[offset]      = force;
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaPiTorsion_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaTorsion_offset;
-        if (pos1 < cAmoebaSim.amoebaPiTorsions )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaPiTorsionID1[pos1];  
-            int4   atom2            = cAmoebaSim.pAmoebaPiTorsionID2[pos1];  
-
-            /*
-               torsionParam1.x      amplitude(1)
-               torsionParam1.y      phase(1)
-               torsionParam1.z      amplitude(2)
-               torsionParam1.w      phase(2)
-               torsionParam2.x      amplitude(3)
-               torsionParam2.y      phase(3)
-            */
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            float4 a4               = cSim.pPosq[atom1.w];
-            float4 a5               = cSim.pPosq[atom2.x];
-            float4 a6               = cSim.pPosq[atom2.y];
-
-            // compute the value of the pi-orbital torsion angle
-
-            float xad               = a1.x - a4.x;
-            float yad               = a1.y - a4.y;
-            float zad               = a1.z - a4.z;
-
-            float xbd               = a2.x - a4.x;
-            float ybd               = a2.y - a4.y;
-            float zbd               = a2.z - a4.z;
-
-            float xec               = a5.x - a3.x;
-            float yec               = a5.y - a3.y;
-            float zec               = a5.z - a3.z;
-
-            float xgc               = a6.x - a3.x;
-            float ygc               = a6.y - a3.y;
-            float zgc               = a6.z - a3.z;
-
-            float xip               = yad*zbd - ybd*zad + a3.x;
-            float yip               = zad*xbd - zbd*xad + a3.y;
-            float zip               = xad*ybd - xbd*yad + a3.z;
-
-            float xiq               = yec*zgc - ygc*zec + a4.x;
-            float yiq               = zec*xgc - zgc*xec + a4.y;
-            float ziq               = xec*ygc - xgc*yec + a4.z;
-
-            float xcp               = a3.x - xip;
-            float ycp               = a3.y - yip;
-            float zcp               = a3.z - zip;
-
-            float xdc               = a4.x - a3.x;
-            float ydc               = a4.y - a3.y;
-            float zdc               = a4.z - a3.z;
-
-            float xqd               = xiq - a4.x;
-            float yqd               = yiq - a4.y;
-            float zqd               = ziq - a4.z;
-
-            float xt                = ycp*zdc - ydc*zcp;
-            float yt                = zcp*xdc - zdc*xcp;
-            float zt                = xcp*ydc - xdc*ycp;
-
-            float xu                = ydc*zqd - yqd*zdc;
-            float yu                = zdc*xqd - zqd*xdc;
-            float zu                = xdc*yqd - xqd*ydc;
-
-            float xtu               = yt*zu - yu*zt;
-            float ytu               = zt*xu - zu*xt;
-            float ztu               = xt*yu - xu*yt;
-
-            float rt2               = xt*xt + yt*yt + zt*zt;
-            float ru2               = xu*xu + yu*yu + zu*zu;
-
-            float rtru              = sqrtf(rt2 * ru2);
-            float rdc               = sqrt(xdc*xdc + ydc*ydc + zdc*zdc);
-
-            float cosine            = rtru > 0.0f ? (xt*xu + yt*yu + zt*zu) / rtru : 0.0f;
-            float sine              = (rtru*rdc) > 0.0f ? (xdc*xtu + ydc*ytu + zdc*ztu) / (rdc*rtru) : 0.0f;
-
-            // zero energy/force if rtru == 0
-
-            float v2                = cAmoebaSim.pAmoebaPiTorsionParameter[pos1];
-                  v2                = rtru > 0.0f ? v2 : 0.0f;
-
-            // compute the multiple angle trigonometry and the phase terms
-
-            float cosine2           = cosine*cosine - sine*sine;
-            float sine2             = 2.0f * cosine * sine;
-            float phi2              = 1.0f - cosine2;
-            float dphi2             = 2.0f * sine2;
-
-            // calculate pi-orbital torsion energy and master chain rule term
-
-            energy                 += v2 * phi2;
-            float dedphi            = v2 * dphi2;
-
-            // chain rule terms for first derivative components
-
-            float xdp               = a4.x - xip;
-            float ydp               = a4.y - yip;
-            float zdp               = a4.z - zip;
-
-            float xqc               = xiq - a3.x;
-            float yqc               = yiq - a3.y;
-            float zqc               = ziq - a3.z;
-
-            float dedxt             =  dedphi * (yt*zdc - ydc*zt) / (rt2*rdc);
-            float dedyt             =  dedphi * (zt*xdc - zdc*xt) / (rt2*rdc);
-            float dedzt             =  dedphi * (xt*ydc - xdc*yt) / (rt2*rdc);
-
-            float dedxu             = -dedphi * (yu*zdc - ydc*zu) / (ru2*rdc);
-            float dedyu             = -dedphi * (zu*xdc - zdc*xu) / (ru2*rdc);
-            float dedzu             = -dedphi * (xu*ydc - xdc*yu) / (ru2*rdc);
-
-            // compute first derivative components for pi-orbital angle
-
-            float dedxip            = zdc*dedyt - ydc*dedzt;
-            float dedyip            = xdc*dedzt - zdc*dedxt;
-            float dedzip            = ydc*dedxt - xdc*dedyt;
-
-            float dedxic            = ydp*dedzt - zdp*dedyt + zqd*dedyu - yqd*dedzu;
-            float dedyic            = zdp*dedxt - xdp*dedzt + xqd*dedzu - zqd*dedxu;
-            float dedzic            = xdp*dedyt - ydp*dedxt + yqd*dedxu - xqd*dedyu;
-
-            float dedxid            = zcp*dedyt - ycp*dedzt + yqc*dedzu - zqc*dedyu;
-            float dedyid            = xcp*dedzt - zcp*dedxt + zqc*dedxu - xqc*dedzu;
-            float dedzid            = ycp*dedxt - xcp*dedyt + xqc*dedyu - yqc*dedxu;
-
-            float dedxiq            = zdc*dedyu - ydc*dedzu;
-            float dedyiq            = xdc*dedzu - zdc*dedxu;
-            float dedziq            = ydc*dedxu - xdc*dedyu;
-
-            // compute first derivative components for individual atoms
-
-            float dedxia            = ybd*dedzip - zbd*dedyip;
-            float dedyia            = zbd*dedxip - xbd*dedzip;
-            float dedzia            = xbd*dedyip - ybd*dedxip;
-
-            float dedxib            = zad*dedyip - yad*dedzip;
-            float dedyib            = xad*dedzip - zad*dedxip;
-            float dedzib            = yad*dedxip - xad*dedyip;
-
-            float dedxie            = ygc*dedziq - zgc*dedyiq;
-            float dedyie            = zgc*dedxiq - xgc*dedziq;
-            float dedzie            = xgc*dedyiq - ygc*dedxiq;
-
-            float dedxig            = zec*dedyiq - yec*dedziq;
-            float dedyig            = xec*dedziq - zec*dedxiq;
-            float dedzig            = yec*dedxiq - xec*dedyiq;
-
-                  dedxic            = dedxic + dedxip - dedxie - dedxig;
-                  dedyic            = dedyic + dedyip - dedyie - dedyig;
-                  dedzic            = dedzic + dedzip - dedzie - dedzig;
-                  dedxid            = dedxid + dedxiq - dedxia - dedxib;
-                  dedyid            = dedyid + dedyiq - dedyia - dedyib;
-                  dedzid            = dedzid + dedziq - dedzia - dedzib;
-
-            int4 atom3              = cAmoebaSim.pAmoebaPiTorsionID3[pos1];
-            unsigned int offset     = atom1.x + atom2.z * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                -= dedxia;
-            force.y                -= dedyia;
-            force.z                -= dedzia;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.y + atom2.w * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxib;
-            force.y                -= dedyib;
-            force.z                -= dedzib;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.z + atom3.x * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxic;
-            force.y                -= dedyic;
-            force.z                -= dedzic;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.w + atom3.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxid;
-            force.y                -= dedyid;
-            force.z                -= dedzid;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom2.x + atom3.z * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxie;
-            force.y                -= dedyie;
-            force.z                -= dedzie;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom2.y + atom3.w * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxig;
-            force.y                -= dedyig;
-            force.z                -= dedzig;
-            cSim.pForce4[offset]    = force;
-
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaStretchBend_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaPiTorsion_offset;
-        if (pos1 < cAmoebaSim.amoebaStretchBends )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaStretchBendID1[pos1];  
-
-            /*
-               parameters.x        length AB
-               parameters.y        length CB
-               parameters.z        angle (in radians)
-               parameters.w        k
-            */
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-
-            // compute the value of the bond angle
-
-            float xab               = a1.x - a2.x;
-            float yab               = a1.y - a2.y;
-            float zab               = a1.z - a2.z;
-
-            float xcb               = a3.x - a2.x;
-            float ycb               = a3.y - a2.y;
-            float zcb               = a3.z - a2.z;
-
-            float rab               = sqrtf(xab*xab + yab*yab + zab*zab);
-            float rcb               = sqrtf(xcb*xcb + ycb*ycb + zcb*zcb);
-
-            float xp                = ycb*zab - zcb*yab;
-            float yp                = zcb*xab - xcb*zab;
-            float zp                = xcb*yab - ycb*xab;
-
-            float rp                = sqrt(xp*xp + yp*yp + zp*zp);
-
-            float    dot            = xab*xcb + yab*ycb + zab*zcb;
-            float    cosine         = rab*rcb > 0.0f ? (dot / (rab*rcb)) : 1.0f;
-                     cosine         = cosine >  1.0f ?  1.0f : cosine;
-                     cosine         = cosine < -1.0f ? -1.0f : cosine;
-            float    angle          = acos(cosine);
-
-            // find chain rule terms for the bond angle deviation
-
-            float4 parameters       = cAmoebaSim.pAmoebaStretchBendParameter[pos1];
-
-            float    dt             = LOCAL_HACK_RADIAN*(angle - parameters.z);
-            float    terma          = rab*rp != 0.0f ? (-LOCAL_HACK_RADIAN/(rab*rab*rp)) : 0.0f;
-            float    termc          = rcb*rp != 0.0f ? ( LOCAL_HACK_RADIAN/(rcb*rcb*rp)) : 0.0f;
-
-            float    ddtdxia        = terma * (yab*zp-zab*yp);
-            float    ddtdyia        = terma * (zab*xp-xab*zp);
-            float    ddtdzia        = terma * (xab*yp-yab*xp);
-
-            float    ddtdxic        = termc * (ycb*zp-zcb*yp);
-            float    ddtdyic        = termc * (zcb*xp-xcb*zp);
-            float    ddtdzic        = termc * (xcb*yp-ycb*xp);
-
-            // find chain rule terms for the bond length deviations
-
-            float    dr             = parameters.x > 0.0f ? (rab - parameters.x) : 0.0f;
-                     terma          = parameters.x > 0.0f ? (1.0f/rab) : 0.0f;
-
-                     dr            += parameters.y > 0.0f ? (rcb - parameters.y) : 0.0f;
-                     termc          = parameters.y > 0.0f ? (1.0f/rcb) : 0.0f;
-
-            float    ddrdxia        = terma * xab;
-            float    ddrdyia        = terma * yab;
-            float    ddrdzia        = terma * zab;
-
-            float    ddrdxic        = termc * xcb;
-            float    ddrdyic        = termc * ycb;
-            float    ddrdzic        = termc * zcb;
-
-            // get the energy and master chain rule terms for derivatives
-
-            float    term           = (rp != 0.0f) ? parameters.w : 0.0f;
-                     energy        += term*dt*dr;
-
-            float    dedxia         = term * (dt*ddrdxia+ddtdxia*dr);
-            float    dedyia         = term * (dt*ddrdyia+ddtdyia*dr);
-            float    dedzia         = term * (dt*ddrdzia+ddtdzia*dr);
-
-            float    dedxic         = term * (dt*ddrdxic+ddtdxic*dr);
-            float    dedyic         = term * (dt*ddrdyic+ddtdyic*dr);
-            float    dedzic         = term * (dt*ddrdzic+ddtdzic*dr);
-
-            float    dedxib         = -dedxia - dedxic;
-            float    dedyib         = -dedyia - dedyic;
-            float    dedzib         = -dedzia - dedzic;
-
-            // increment the total stretch-bend energy and derivatives
-
-            unsigned int offset     = atom1.x + atom1.w * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                -= dedxia;
-            force.y                -= dedyia;
-            force.z                -= dedzia;
-            cSim.pForce4[offset]    = force;
-
-            int2 atom2              = cAmoebaSim.pAmoebaStretchBendID2[pos1];
-            offset                  = atom1.y + atom2.x * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxib;
-            force.y                -= dedyib;
-            force.z                -= dedzib;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.z + atom2.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxic;
-            force.y                -= dedyic;
-            force.z                -= dedzic;
-            cSim.pForce4[offset]    = force;
-
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaOutOfPlaneBend_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaStretchBend_offset;
-        if (pos1 < cAmoebaSim.amoebaOutOfPlaneBends )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaOutOfPlaneBendID1[pos1];  
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            float4 a4               = cSim.pPosq[atom1.w];
-
-            // compute the value of the bond angle
-
-            float xab               = a1.x - a2.x;
-            float yab               = a1.y - a2.y;
-            float zab               = a1.z - a2.z;
-
-            float xcb               = a3.x - a2.x;
-            float ycb               = a3.y - a2.y;
-            float zcb               = a3.z - a2.z;
-
-            // compute the out-of-plane bending angle
-
-            float xdb               = a4.x - a2.x;
-            float ydb               = a4.y - a2.y;
-            float zdb               = a4.z - a2.z;
-
-            float xad               = a1.x - a4.x;
-            float yad               = a1.y - a4.y;
-            float zad               = a1.z - a4.z;
-
-            float xcd               = a3.x - a4.x;
-            float ycd               = a3.y - a4.y;
-            float zcd               = a3.z - a4.z;
-
-            float rdb2              = xdb*xdb + ydb*ydb + zdb*zdb;
-            float rad2              = xad*xad + yad*yad + zad*zad;
-            float rcd2              = xcd*xcd + ycd*ycd + zcd*zcd;
-
-            float ee                = xab*(ycb*zdb-zcb*ydb) +  yab*(zcb*xdb-xcb*zdb) + zab*(xcb*ydb-ycb*xdb);
-
-            float dot               = xad*xcd + yad*ycd + zad*zcd;
-            float cc                = rad2*rcd2 - dot*dot;
-            float bkk2              = (cc != 0.0f) ? (rdb2 - ee*ee/cc) : 0.0f;
-
-            float cosine            = (rdb2  !=  0.0f) ? sqrtf(bkk2/rdb2) : 0.0f;;
-                  cosine            = (cosine >  1.0f) ?  1.0f            : cosine;
-                  cosine            = (cosine < -1.0f) ? -1.0f            : cosine;
-
-/*
-            double rdb2             = xdb*xdb + ydb*ydb + zdb*zdb;
-            double rad2             = xad*xad + yad*yad + zad*zad;
-            double rcd2             = xcd*xcd + ycd*ycd + zcd*zcd;
-
-            double ee               = xab*(ycb*zdb-zcb*ydb) +  yab*(zcb*xdb-xcb*zdb) + zab*(xcb*ydb-ycb*xdb);
-
-            double dot              = xad*xcd + yad*ycd + zad*zcd;
-            double cc               = rad2*rcd2 - dot*dot;
-            double bkk2             = (cc != 0.0) ? (rdb2 - ee*ee/cc) : 0.0;
-
-            double cosine           = (rdb2  !=  0.0) ? sqrt(bkk2/rdb2) : 0.0;;
-                   cosine           = (cosine >  1.0) ?  1.0            : cosine;
-                   cosine           = (cosine < -1.0) ? -1.0            : cosine;
-*/
-            // find the out-of-plane energy and master chain rule terms
-
-            float    dt             = LOCAL_HACK_RADIAN_D*acos(cosine);
-            float    dt2            = dt  * dt;
-            float    dt3            = dt2 * dt;
-            float    dt4            = dt2 * dt2;
-            float      k            = (rdb2 != 0.0f && cc != 0.0f) ? cAmoebaSim.pAmoebaOutOfPlaneBendParameter[pos1] : 0.0f;
-                    
-                   energy          += k*dt2*(1.0f + (cAmoebaSim.amoebaOutOfPlaneBendCubicK*  dt ) +
-                                                    (cAmoebaSim.amoebaOutOfPlaneBendQuarticK*dt2) +
-                                                    (cAmoebaSim.amoebaOutOfPlaneBendPenticK* dt3) +
-                                                    (cAmoebaSim.amoebaOutOfPlaneBendSexticK* dt4) );
-
-            float    deddt          = k*dt*LOCAL_HACK_RADIAN*(2.0f                                           + 
-                                                             (3.0f*cAmoebaSim.amoebaOutOfPlaneBendCubicK*  dt ) +
-                                                             (4.0f*cAmoebaSim.amoebaOutOfPlaneBendQuarticK*dt2) +
-                                                             (5.0f*cAmoebaSim.amoebaOutOfPlaneBendPenticK* dt3) +
-                                                             (6.0f*cAmoebaSim.amoebaOutOfPlaneBendSexticK* dt4) );
-
-            float    eeSign         = ee >= 0.0f ? 1.0f : -1.0f;
-            float    dedcos         = -deddt*eeSign/sqrtf(cc*bkk2);
-
-            // chain rule terms for first derivative components
-
-            float    term            = ee / cc;
-
-            float    dccdxia         = (xad*rcd2-xcd*dot) * term;
-            float    dccdyia         = (yad*rcd2-ycd*dot) * term;
-            float    dccdzia         = (zad*rcd2-zcd*dot) * term;
-
-            float    dccdxic         = (xcd*rad2-xad*dot) * term;
-            float    dccdyic         = (ycd*rad2-yad*dot) * term;
-            float    dccdzic         = (zcd*rad2-zad*dot) * term;
-
-            float    dccdxid         = -dccdxia - dccdxic;
-            float    dccdyid         = -dccdyia - dccdyic;
-            float    dccdzid         = -dccdzia - dccdzic;
-
-                     term            = ee / rdb2;
-
-            float    deedxia         = ydb*zcb - zdb*ycb;
-            float    deedyia         = zdb*xcb - xdb*zcb;
-            float    deedzia         = xdb*ycb - ydb*xcb;
-
-            float    deedxic         = yab*zdb - zab*ydb;
-            float    deedyic         = zab*xdb - xab*zdb;
-            float    deedzic         = xab*ydb - yab*xdb;
-
-            float    deedxid         = ycb*zab - zcb*yab + xdb*term;
-            float    deedyid         = zcb*xab - xcb*zab + ydb*term;
-            float    deedzid         = xcb*yab - ycb*xab + zdb*term;
-
-            // compute first derivative components for this angle
-
-            float    dedxia          = dedcos * (dccdxia+deedxia);
-            float    dedyia          = dedcos * (dccdyia+deedyia);
-            float    dedzia          = dedcos * (dccdzia+deedzia);
-
-            float    dedxic          = dedcos * (dccdxic+deedxic);
-            float    dedyic          = dedcos * (dccdyic+deedyic);
-            float    dedzic          = dedcos * (dccdzic+deedzic);
-
-            float    dedxid          = dedcos * (dccdxid+deedxid);
-            float    dedyid          = dedcos * (dccdyid+deedyid);
-            float    dedzid          = dedcos * (dccdzid+deedzid);
-
-            float    dedxib          = -dedxia - dedxic - dedxid;
-            float    dedyib          = -dedyia - dedyic - dedyid;
-            float    dedzib          = -dedzia - dedzic - dedzid;
-
-            // increment the out-of-plane bending energy and gradient;
-
-            int4   atom2            = cAmoebaSim.pAmoebaOutOfPlaneBendID2[pos1];  
-
-            unsigned int offset     = atom1.x + atom2.x * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                -= dedxia;
-            force.y                -= dedyia;
-            force.z                -= dedzia;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.y + atom2.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxib;
-            force.y                -= dedyib;
-            force.z                -= dedzib;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.z + atom2.z * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxic;
-            force.y                -= dedyic;
-            force.z                -= dedzic;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.w + atom2.w * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxid;
-            force.y                -= dedyid;
-            force.z                -= dedzid;
-            cSim.pForce4[offset]    = force;
-
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cAmoebaSim.amoebaTorsionTorsion_offset)
-    {
-        unsigned int pos1   = pos - cAmoebaSim.amoebaOutOfPlaneBend_offset;
-        if (pos1 < cAmoebaSim.amoebaTorsionTorsions )
-        {
-            int4   atom1            = cAmoebaSim.pAmoebaTorsionTorsionID1[pos1];  
-            int4   atom2            = cAmoebaSim.pAmoebaTorsionTorsionID2[pos1];  
-
-            // atom2.y = chiral atom index
-            // atom2.z = grid index
-
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            float4 a4               = cSim.pPosq[atom1.w];
-            float4 a5               = cSim.pPosq[atom2.x];
-
-            float xba               = a2.x - a1.x;
-            float yba               = a2.y - a1.y;
-            float zba               = a2.z - a1.z;
-
-            float xcb               = a3.x - a2.x;
-            float ycb               = a3.y - a2.y;
-            float zcb               = a3.z - a2.z;
-
-            float xdc               = a4.x - a3.x;
-            float ydc               = a4.y - a3.y;
-            float zdc               = a4.z - a3.z;
-
-            float xed               = a5.x - a4.x;
-            float yed               = a5.y - a4.y;
-            float zed               = a5.z - a4.z;
-
-            float xt                = yba*zcb - ycb*zba;
-            float yt                = zba*xcb - zcb*xba;
-            float zt                = xba*ycb - xcb*yba;
-
-            float xu                = ycb*zdc - ydc*zcb;
-            float yu                = zcb*xdc - zdc*xcb;
-            float zu                = xcb*ydc - xdc*ycb;
-
-            float rt2               = xt*xt + yt*yt + zt*zt;
-            float ru2               = xu*xu + yu*yu + zu*zu;
-
-            float rtru              = sqrtf(rt2 * ru2);
-
-            float xv                = ydc*zed - yed*zdc;
-            float yv                = zdc*xed - zed*xdc;
-            float zv                = xdc*yed - xed*ydc;
-
-            float rv2               = xv*xv + yv*yv + zv*zv;
-            float rurv              = sqrtf(ru2 * rv2);
-
-            float    rcb            = sqrtf(xcb*xcb + ycb*ycb + zcb*zcb);
-            float    cosine1        = rtru != 0.0f ? ((xt*xu + yt*yu + zt*zu) / rtru) : 0.0f;
-                     cosine1        = cosine1 >  1.0f ?  1.0f : cosine1;
-                     cosine1        = cosine1 < -1.0f ? -1.0f : cosine1;
-
-            float    angle1         = LOCAL_HACK_RADIAN * acos(cosine1);
-            float    sign           = xba*xu + yba*yu + zba*zu;
-                     angle1         = sign < 0.0f ? -angle1 : angle1;
-            float    value1         = angle1;
-
-            float    rdc            = sqrtf(xdc*xdc + ydc*ydc + zdc*zdc);
-            float    cosine2        = (xu*xv + yu*yv + zu*zv) / rurv;
-                     cosine2        = cosine2 >  1.0f ?  1.0f : cosine2;
-                     cosine2        = cosine2 < -1.0f ? -1.0f : cosine2;
-            float    angle2         = LOCAL_HACK_RADIAN * acos(cosine2);
-
-                     sign           = xcb*xv + ycb*yv + zcb*zv;
-                     angle2         = sign < 0.0f ? -angle2 : angle2;
-            float    value2         = angle2;
-
-            // check for inverted chirality at the central atom
-
-            // if atom2.y < 0, then no chiral check required
-            // sign is set to 1.0 in this case
-            // user atom2.x for the atom index to avoid warp divergence
-
-            int chiralAtomIndex     = (atom2.y  > -1) ? atom2.y : atom2.x;
-            float4 a6               = cSim.pPosq[chiralAtomIndex];
-
-            float xac               = a6.x - a3.x;
-            float yac               = a6.y - a3.y;
-            float zac               = a6.z - a3.z;
-
-            float xbc               = a2.x - a3.x;
-            float ybc               = a2.y - a3.y;
-            float zbc               = a2.z - a3.z;
-
-            // xdc, ydc, zdc appear above
-
-            float xdc1              = a4.x - a3.x;
-            float ydc1              = a4.y - a3.y;
-            float zdc1              = a4.z - a3.z;
-
-            float c1                = ybc*zdc1 - zbc*ydc1;
-            float c2                = ydc1*zac - zdc1*yac;
-            float c3                = yac*zbc  - zac*ybc;
-            float vol               = xac*c1 + xbc*c2 + xdc1*c3;
-                  sign              = vol > 0.0f ? 1.0f : -1.0f;
-                  sign              = atom2.y < 0 ? 1.0f : sign;
-                  value1           *= sign;
-                  value2           *= sign;
-
-           // use bicubic interpolation to compute spline values
-           // compute indices into grid based on angles
-
-           int index1               = (int) ((value1 - cAmoebaSim.amoebaTorTorGridBegin[atom2.z])/cAmoebaSim.amoebaTorTorGridDelta[atom2.z] + 1.0e-05f);
-           float fIndex             = (float) index1;
-           float x1l                = cAmoebaSim.amoebaTorTorGridDelta[atom2.z]*fIndex + cAmoebaSim.amoebaTorTorGridBegin[atom2.z];
-           float x1u                = x1l + cAmoebaSim.amoebaTorTorGridDelta[atom2.z];
-
-           int index2               = (int) ((value2 - cAmoebaSim.amoebaTorTorGridBegin[atom2.z])/cAmoebaSim.amoebaTorTorGridDelta[atom2.z] + 1.0e-05f);
-                 fIndex             = (float) index2;
-           float x2l                = cAmoebaSim.amoebaTorTorGridDelta[atom2.z]*fIndex + cAmoebaSim.amoebaTorTorGridBegin[atom2.z];
-           float x2u                = x2l + cAmoebaSim.amoebaTorTorGridDelta[atom2.z];
-
-           int posIndex1            = index2 + index1*cAmoebaSim.amoebaTorTorGridNy[atom2.z];
-               posIndex1           += cAmoebaSim.amoebaTorTorGridOffset[atom2.z];
-
-           int posIndex2            = index2 + (index1+1)*cAmoebaSim.amoebaTorTorGridNy[atom2.z];
-               posIndex2           += cAmoebaSim.amoebaTorTorGridOffset[atom2.z];
-
-            // load grid points surrounding angle
-
-            float4 y;
-            float4 y1;
-            float4 y2;
-            float4 y12;
-
-            int localIndex         = posIndex1;
-                   y.x             = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].x;
-                   y1.x            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].y;
-                   y2.x            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].z;
-                   y12.x           = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].w;
-
-                   localIndex      = posIndex2;
-                   y.y             = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].x;
-                   y1.y            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].y;
-                   y2.y            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].z;
-                   y12.y           = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].w;
-
-                   localIndex      = posIndex2 + 1;
-                   y.z             = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].x;
-                   y1.z            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].y;
-                   y2.z            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].z;
-                   y12.z           = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].w;
-
-                   localIndex      = posIndex1 + 1;
-                   y.w             = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].x;
-                   y1.w            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].y;
-                   y2.w            = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].z;
-                   y12.w           = cAmoebaSim.pAmoebaTorsionTorsionGrids[localIndex].w;
-
-            // perform interpolation
-
-            float    e;
-            float    dedang1;
-            float    dedang2;
-            //float4   cx0,cx1,cx2,cx3;
-
-            bicubic( y, y1, y2, y12, value1, x1l, x1u, value2,  x2l,  x2u, &e, &dedang1, &dedang2 );
-            //bicubic( y, y1, y2, y12, value1, x1l, x1u, value2,  x2l,  x2u, &e, &dedang1, &dedang2, &cx0, &cx1, &cx2, &cx3 );
-            energy                    += e;
-            dedang1                   *=  sign * LOCAL_HACK_RADIAN;
-            dedang2                   *=  sign * LOCAL_HACK_RADIAN;
-
-            // chain rule terms  for first angle derivative components
-
-            float    xca               = a3.x - a1.x;
-            float    yca               = a3.y - a1.y;
-            float    zca               = a3.z - a1.z;
-
-            float    xdb               = a4.x - a2.x;
-            float    ydb               = a4.y - a2.y;
-            float    zdb               = a4.z - a2.z;
-
-            float    dedxt             =  dedang1 * (yt*zcb - ycb*zt) / (rt2*rcb);
-            float    dedyt             =  dedang1 * (zt*xcb - zcb*xt) / (rt2*rcb);
-            float    dedzt             =  dedang1 * (xt*ycb - xcb*yt) / (rt2*rcb);
-            float    dedxu             = -dedang1 * (yu*zcb - ycb*zu) / (ru2*rcb);
-            float    dedyu             = -dedang1 * (zu*xcb - zcb*xu) / (ru2*rcb);
-            float    dedzu             = -dedang1 * (xu*ycb - xcb*yu) / (ru2*rcb);
-
-            // compute  first derivative components for first angle
-
-            float    dedxia            = zcb*dedyt - ycb*dedzt;
-            float    dedyia            = xcb*dedzt - zcb*dedxt;
-            float    dedzia            = ycb*dedxt - xcb*dedyt;
-
-            float    dedxib            = yca*dedzt - zca*dedyt + zdc*dedyu - ydc*dedzu;
-            float    dedyib            = zca*dedxt - xca*dedzt + xdc*dedzu - zdc*dedxu;
-            float    dedzib            = xca*dedyt - yca*dedxt + ydc*dedxu - xdc*dedyu;
-
-            float    dedxic            = zba*dedyt - yba*dedzt + ydb*dedzu - zdb*dedyu;
-            float    dedyic            = xba*dedzt - zba*dedxt + zdb*dedxu - xdb*dedzu;
-            float    dedzic            = yba*dedxt - xba*dedyt + xdb*dedyu - ydb*dedxu;
-
-            float    dedxid            = zcb*dedyu - ycb*dedzu;
-            float    dedyid            = xcb*dedzu - zcb*dedxu;
-            float    dedzid            = ycb*dedxu - xcb*dedyu;
-
-            // chain rule terms  for second angle derivative components
-
-            float    xec               = a5.x - a3.x;
-            float    yec               = a5.y - a3.y;
-            float    zec               = a5.z - a3.z;
-
-            float    dedxu2            =  dedang2 * (yu*zdc - ydc*zu) / (ru2*rdc);
-            float    dedyu2            =  dedang2 * (zu*xdc - zdc*xu) / (ru2*rdc);
-            float    dedzu2            =  dedang2 * (xu*ydc - xdc*yu) / (ru2*rdc);
-            float    dedxv2            = -dedang2 * (yv*zdc - ydc*zv) / (rv2*rdc);
-            float    dedyv2            = -dedang2 * (zv*xdc - zdc*xv) / (rv2*rdc);
-            float    dedzv2            = -dedang2 * (xv*ydc - xdc*yv) / (rv2*rdc);
-
-            // compute  first derivative components for second angle
-
-            float    dedxib2           = zdc*dedyu2 - ydc*dedzu2;
-            float    dedyib2           = xdc*dedzu2 - zdc*dedxu2;
-            float    dedzib2           = ydc*dedxu2 - xdc*dedyu2;
-            float    dedxic2           = ydb*dedzu2 - zdb*dedyu2 + zed*dedyv2 - yed*dedzv2;
-            float    dedyic2           = zdb*dedxu2 - xdb*dedzu2 + xed*dedzv2 - zed*dedxv2;
-            float    dedzic2           = xdb*dedyu2 - ydb*dedxu2 + yed*dedxv2 - xed*dedyv2;
-            float    dedxid2           = zcb*dedyu2 - ycb*dedzu2 + yec*dedzv2 - zec*dedyv2;
-            float    dedyid2           = xcb*dedzu2 - zcb*dedxu2 + zec*dedxv2 - xec*dedzv2;
-            float    dedzid2           = ycb*dedxu2 - xcb*dedyu2 + xec*dedyv2 - yec*dedxv2;
-            float    dedxie2           = zdc*dedyv2 - ydc*dedzv2;
-            float    dedyie2           = xdc*dedzv2 - zdc*dedxv2;
-            float    dedzie2           = ydc*dedxv2 - xdc*dedyv2;
-
-            // increment the torsion-torsion energy and gradient
-
-/*
-            float    ett = ett + e
-            float    dett(1,ia) = dett(1,ia) + dedxia 
-            float    dett(2,ia) = dett(2,ia) + dedyia 
-            float    dett(3,ia) = dett(3,ia) + dedzia
-            float    dett(1,ib) = dett(1,ib) + dedxib + dedxib2
-            float    dett(2,ib) = dett(2,ib) + dedyib + dedyib2
-            float    dett(3,ib) = dett(3,ib) + dedzib + dedzib2
-            float    dett(1,ic) = dett(1,ic) + dedxic + dedxic2
-            float    dett(2,ic) = dett(2,ic) + dedyic + dedyic2 
-            float    dett(3,ic) = dett(3,ic) + dedzic + dedzic2
-            float    dett(1,id) = dett(1,id) + dedxid + dedxid2
-            float    dett(2,id) = dett(2,id) + dedyid + dedyid2
-            float    dett(3,id) = dett(3,id) + dedzid + dedzid2
-            float    dett(1,ie) = dett(1,ie) + dedxie2 
-            float    dett(2,ie) = dett(2,ie) + dedyie2 
-            float    dett(3,ie) = dett(3,ie) + dedzie2
-*/
-            // increment the torsion-torsion gradient;
-            int4   atom3            = cAmoebaSim.pAmoebaTorsionTorsionID3[pos1];  
-
-            unsigned int offset     = atom1.x + atom2.w * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                -= dedxia;
-            force.y                -= dedyia;
-            force.z                -= dedzia;
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.y + atom3.x * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= (dedxib + dedxib2);
-            force.y                -= (dedyib + dedyib2);
-            force.z                -= (dedzib + dedzib2);
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.z + atom3.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= (dedxic + dedxic2);
-            force.y                -= (dedyic + dedyic2);
-            force.z                -= (dedzic + dedzic2);
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom1.w + atom3.z * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= (dedxid + dedxid2);
-            force.y                -= (dedyid + dedyid2);
-            force.z                -= (dedzid + dedzid2);
-            cSim.pForce4[offset]    = force;
-
-            offset                  = atom2.x + atom3.w * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= dedxie2;
-            force.y                -= dedyie2;
-            force.z                -= dedzie2;
-            cSim.pForce4[offset]    = force;
-
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
-}
-
-
-void kCalculateAmoebaLocalForces(amoebaGpuContext gpu)
-{
-   
-    if( gpu->log ){
-        (void) fprintf( gpu->log,"kCalculateAmoebaLocalForces: blks=%u thrds/blk=%u %u\n",
-                        gpu->gpuContext->sim.blocks, gpu->gpuContext->sim.localForces_threads_per_block,
-                        gpu->gpuContext->sim.localForces_threads_per_block * sizeof(Vectors)); fflush( stderr );
-    }
-
-    kCalculateAmoebaLocalForces_kernel<<<gpu->gpuContext->sim.blocks, gpu->gpuContext->sim.localForces_threads_per_block, gpu->gpuContext->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
-    LAUNCHERROR("kCalculateAmoebaLocalForces");
-
-/*    
-    fprintf( stderr, "Done kCalculateAmoebaLocalForces %p \n", gpu->gpuContext->psForce4); fflush( stderr );
-    gpu->gpuContext->psForce4->Print( stderr ); fflush( stderr );
-    gpu->gpuContext->psEnergy->Print( stderr ); fflush( stderr );
-    GetCalculateAmoebaLocalForcesSim( gpu );
-    gpu->gpuContext->psForce4->Print( stderr ); fflush( stderr );
-    gpu->gpuContext->psEnergy->Print( stderr ); fflush( stderr );
-    fprintf( stderr, "Ez %p\n", (float* ) gpu->gpuContext->sim.pEnergy );
-*/
-    
-}
-
--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -3770,6 +3770,15 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
                    (void) fprintf( log, "CMMotionRemover added w/ frequency=%d at line=%d\n", frequency, lineCount );
                }
   
+            // All forces 
+  
+            } else if( field == ALL_FORCES ){
+                readVec3( filePtr, tokens, forces[ALL_FORCES], &lineCount, field, log );
+            } else if( field == "AllEnergy" ){
+                if( tokens.size() > 1 ){
+                   potentialEnergy[ALL_FORCES] = atof( tokens[1].c_str() ); 
+                }
+   
            // AmoebaHarmonicBond
  
            } else if( field == "AmoebaHarmonicBondParameters" ){
@@ -3876,6 +3885,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
                       field == "AmoebaGk_A_Force"                     ||
                       field == "AmoebaGk_A_DrB"                       ||
                       field == "AmoebaDBorn"                          ||
+                       field == "AmoebaBorn1Force"                     ||
                       field == "AmoebaBornForce"                      ||
                       field == "AmoebaGkEdiffForceAndTorque"          ||
                       field == "AmoebaGkEdiffForce"                      ){
@@ -3885,6 +3895,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
           } else if( field == "AmoebaGkEnergy"       || 
                      field == "AmoebaGkEdiffEnergy"  ||
                      field == "AmoebaGk_A_Energy"    ||
+                      field == "AmoebaBorn1Energy"    ||
                      field == "AmoebaBornEnergy"     ){
               double value = atof( tokens[1].c_str() );
               std::vector< std::vector<double> > vectorOfDoubleVectors;
@@ -3948,14 +3959,19 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
    double totalPotentialEnergy = 0.0;
    if( log )(void) fprintf( log, "Potential energies\n" );
   
+    double allEnergy = 0.0;
    for( MapStringDoubleI ii = potentialEnergy.begin(); ii != potentialEnergy.end(); ii++ ){
+        if( ii->first == ALL_FORCES ){
+            allEnergy = ii->second;
+        } else {
            totalPotentialEnergy += ii->second;
+        }
        if( log )(void) fprintf( log, "%30s %14.7e\n", ii->first.c_str(), ii->second );
    }
-    potentialEnergy["AllForces"] = totalPotentialEnergy;
+    potentialEnergy["SumOfInputEnergies"] = totalPotentialEnergy;
       
    if( log ){
-       (void) fprintf( log, "Total PE %14.7e\n", totalPotentialEnergy );
+       (void) fprintf( log, "Total PE %14.7e  %14.7e\n", totalPotentialEnergy, allEnergy );
       (void) fprintf( log, "Read %d lines from file=<%s>\n", lineCount, inputParameterFile.c_str() );
       (void) fflush( log );
    }
@@ -4614,6 +4630,9 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
            activeForceNames += ii->first + ":";
        }
    }
+    if( forceList.size() >= 11 ){
+        activeForceNames =ALL_FORCES;
+    }
  
    std::vector<Vec3> expectedForces;
    expectedForces.resize( system.getNumParticles() );
@@ -4634,7 +4653,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
        }
    }

-    int showAll                            = 0;
+    int showAll                            = 1;
    double energyConversion;
    double forceConversion;
    if( useOpenMMUnits ){
@@ -4650,6 +4669,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
    if( log ){
        std::vector<FILE*> fileList;
        if( log )fileList.push_back( log );
+        double cutoffDelta = 0.02;
        for( unsigned int ii = 0; ii < fileList.size(); ii++ ){
            FILE* filePtr = fileList[ii];
            (void) fprintf( filePtr, "\n" );
@@ -4668,9 +4688,10 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
                double delta           = fabs( normF1 - normF2 );
                double sumNorms        = 0.5*(normF1 + normF2);
                double relativeDelta   = sumNorms > 0.0 ? fabs( normF1 - normF2 )/sumNorms : 0.0;
-                if( ( (maxRelativeDelta < relativeDelta) && (sumNorms > 0.1)) || showAll ){
-                    (void) fprintf( filePtr, "%6u %10.3e %10.3e [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]\n", ii, relativeDelta, delta,
-                                    expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forceConversion*forces[ii][0], forceConversion*forces[ii][1], forceConversion*forces[ii][2] );
+                bool badMatch          = (cutoffDelta < relativeDelta) && (sumNorms > 0.1) ? true : false;
+                if( badMatch || showAll ){
+                    (void) fprintf( filePtr, "%6u %10.3e %10.3e [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]  %s\n", ii, relativeDelta, delta,
+                                    expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forceConversion*forces[ii][0], forceConversion*forces[ii][1], forceConversion*forces[ii][2], ( (showAll && badMatch) ? " XXX" : "") );
                    if( ( (maxRelativeDelta < relativeDelta) && (sumNorms > 0.1)) ){
                        maxRelativeDelta      =  relativeDelta;
                        maxRelativeDeltaIndex = ii;
@@ -5466,7 +5487,7 @@ int runTestsUsingAmoebaTinkerParameterFile( MapStringString& argumentMap ){
            //if( checkEnergyForceConsistency )checkForces = 0;
        } else if( key == "log" ){
            logControl = atoi( value.c_str() );
-        } else if( key == "AllForces" ){
+        } else if( key == ALL_FORCES ){
            initializeForceMap( forceMap, 1 );
        } else if( key == AMOEBA_HARMONIC_BOND_FORCE              ||
                   key == AMOEBA_HARMONIC_ANGLE_FORCE             ||

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.h
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.h
@@ -76,6 +76,7 @@ static std::string AMOEBA_GK_FORCE                                    = "AmoebaG
 static std::string AMOEBA_VDW_FORCE                                   = "AmoebaVdw";
 static std::string AMOEBA_WCA_DISPERSION_FORCE                        = "AmoebaWcaDispersion";
 static std::string AMOEBA_SASA_FORCE                                  = "AmoebaSASA";
+static std::string ALL_FORCES                                         = "AllForces";

 static std::string AMOEBA_MULTIPOLE_ROTATION_MATRICES                 = "AmoebaMultipoleRotationMatrices";
 static std::string AMOEBA_MULTIPOLE_ROTATED                           = "AmoebaMultipolesRotated";