Optimzations and bug fix for mapping of torque to force for small molecules

4a1e9683 · Mark Friedrichs · 44bfd168 · 4a1e9683 · 4a1e9683 · 4a1e9683
Commit 4a1e9683 authored Jun 28, 2011 by Mark Friedrichs
20 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -57,6 +57,7 @@ extern "C" OPENMMCUDA_EXPORT void registerKernelFactories() {
             platform.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), factory);
             platform.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), factory);
             platform.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), factory);
+             platform.registerKernelFactory(CalcAmoebaForcesAndEnergyKernel::Name(), factory);
        }
    }
 }
@@ -138,5 +139,8 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if (name == CalcAmoebaUreyBradleyForceKernel::Name())
        return new CudaCalcAmoebaUreyBradleyForceKernel(name, platform, *amoebaCudaData, context.getSystem());
+    if (name == CalcAmoebaForcesAndEnergyKernel::Name())
+        return new CalcAmoebaForcesAndEnergyKernel(name, platform, *amoebaCudaData);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -44,6 +44,50 @@ extern "C" int gpuSetConstants( gpuContext gpu );
 using namespace OpenMM;
 using namespace std;
+void CalcAmoebaForcesAndEnergyKernel::initialize(const System& system) {
+}
+void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
+//fprintf( stderr, "In CalcAmoebaForcesAndEnergyKernel::beginComputation computeForceCount=%d inbMethod=%d GBSA=%d includeForces=%d includeEnergy=%d\n", 
+//         data.cudaPlatformData.computeForceCount, data.cudaPlatformData.nonbondedMethod,  data.getAmoebaGpu()->gpuContext->bIncludeGBSA, includeForces, includeEnergy ); fflush( stderr );
+    amoebaGpuContext amoebaGpu  = data.getAmoebaGpu();
+    _gpuContext* gpu            = data.getAmoebaGpu()->gpuContext;
+    if (data.cudaPlatformData.nonbondedMethod != NO_CUTOFF && data.cudaPlatformData.computeForceCount%100 == 0){
+        gpuReorderAtoms(gpu);
+    }
+    data.cudaPlatformData.computeForceCount++;
+    if( gpu->bIncludeGBSA ){
+        kClearBornSumAndForces(gpu);
+    } else if (includeForces){
+        kClearForces(gpu);
+    }
+    if (includeEnergy)
+        kClearEnergy(gpu);
+}
+double CalcAmoebaForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
+//fprintf( stderr, "IN CalcAmoebaForcesAndEnergyKernel::finishComputation\n" ); fflush( stderr );
+    amoebaGpuContext amoebaGpu  = data.getAmoebaGpu();
+    _gpuContext* gpu            = data.getAmoebaGpu()->gpuContext;
+    if( includeForces ){
+        kReduceForces(gpu);
+    }
+    double energy = 0.0; 
+    if( includeEnergy ){
+        energy = kReduceEnergy(gpu);
+    }    
+    return energy;
+}
 /* -------------------------------------------------------------------------- *
 *                           Calculates bonded forces                         *
 * -------------------------------------------------------------------------- */

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -33,6 +33,49 @@
 namespace OpenMM {
+/**
+ * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
+ * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
+ * necessary work at the end to determine the final results.
+ */
+class CalcAmoebaForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
+public:
+    CalcAmoebaForcesAndEnergyKernel(std::string name, const Platform& platform, AmoebaCudaData& data) : CalcForcesAndEnergyKernel(name, platform), data(data) {
+    }   
+    /*** 
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /** 
+     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
+     * any ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     */
+    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy);
+    /** 
+     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
+     * every ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
+     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
+     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
+     */
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy);
+private:
+    AmoebaCudaData& data;
+};
 /**
 * This kernel is invoked by AmoebaHarmonicBondForce to calculate the forces acting on the system and the energy of the system.
 */
@@ -72,9 +115,9 @@ private:
 class CudaCalcAmoebaUreyBradleyForceKernel : public CalcAmoebaUreyBradleyForceKernel {
 public:
    CudaCalcAmoebaUreyBradleyForceKernel(std::string name, 
-                                          const Platform& platform,
+                                         const Platform& platform,
-                                          AmoebaCudaData& data,
+                                         AmoebaCudaData& data,
-                                          System& system);
+                                         System& system);
    ~CudaCalcAmoebaUreyBradleyForceKernel();
    /**
     * Initialize the kernel.

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -358,16 +358,15 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log );
    (void) fprintf( log, "     pMultipoleParticlesIdsAndAxisType  %p\n",      amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType);
-    (void) fprintf( log, "     maxTorqueBufferIndex            %d\n",      amoebaGpu->maxTorqueBufferIndex );
+    (void) fprintf( log, "     maxTorqueBufferIndex               %d\n",      amoebaGpu->amoebaSim.maxTorqueBufferIndex );
    totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log );
    int memory   = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log );
+    (void) fprintf( log, "     torqueMapForce4Delete              %d\n",      amoebaGpu->torqueMapForce4Delete );
    if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory;
    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
-    (void) fprintf( log, "     psMultipoleParticlesTorqueBufferIndices %p\n",      amoebaGpu->amoebaSim.pMultipoleParticlesTorqueBufferIndices);
    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log );
    (void) fprintf( log, "     pMolecularDipole                   %p\n",      amoebaGpu->amoebaSim.pMolecularDipole);
    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log );
@@ -1341,7 +1340,6 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
 static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu )
 {
    // ---------------------------------------------------------------------------------------
    static const std::string methodName = "gpuFixedEFieldAllocate";
    // ---------------------------------------------------------------------------------------
@@ -1869,7 +1867,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    }
-    amoebaGpu->maxTorqueBufferIndex = maxTorqueBufferIndex;
+    amoebaGpu->amoebaSim.maxTorqueBufferIndex = maxTorqueBufferIndex;
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "Max axis count=%d\n", maxTorqueBufferIndex );
        std::string axisLabel[maxAxisType+1] = {  "ZThenX", "Bisector", "ZBisect", "ThreeFold", "ZOnly", "NoAxisType", "Unknown"};
@@ -2812,14 +2810,14 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener
    // use the Cuda force output buffers for mapping torques onto forces, if max torque buffer count < number of buffers
-    if( amoebaGpu->maxTorqueBufferIndex > outputBuffers ){
+    if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > outputBuffers ){
-        amoebaGpu->psTorqueMapForce4            = new CUDAStream<float4>(paddedNumberOfAtoms*amoebaGpu->maxTorqueBufferIndex, 1, "torqueMapForce");
+        amoebaGpu->psTorqueMapForce4            = new CUDAStream<float4>(paddedNumberOfAtoms, amoebaGpu->amoebaSim.maxTorqueBufferIndex, "torqueMapForce");
        amoebaGpu->torqueMapForce4Delete        = 1;
    } else {
        amoebaGpu->psTorqueMapForce4            = amoebaGpu->gpuContext->psForce4;
        amoebaGpu->torqueMapForce4Delete        = 0;
    }
-    amoebaGpu->amoebaSim.pTorqueMapForce4 = amoebaGpu->psTorqueMapForce4->_pDevData;
+    amoebaGpu->amoebaSim.pTorqueMapForce4      = amoebaGpu->psTorqueMapForce4->_pDevData;
    return;
 }
@@ -3013,16 +3011,19 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
        (void) fflush( amoebaGpu->log );
    }
 #else
-    if( debugOn && amoebaGpu->log ){
+    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d cells w/ exclusions\n",
+    //if( debugOn && amoebaGpu->log ){
-                                        methodName.c_str(), numWithScalingIndices );
+        (void) fprintf( amoebaGpu->log, "%s %d cells w/ exclusions out of %d\n",
-        for (unsigned int ii = 0; ii < cells; ii++)
+                                        methodName.c_str(), numWithScalingIndices, cells );
-        {
+        if( debugOn ){
-            unsigned int x, y, exclusion;
+            for (unsigned int ii = 0; ii < cells; ii++)
-            decodeCell( pWorkList[ii], &x, &y, &exclusion );
+            {
-            if( exclusion ){
+                unsigned int x, y, exclusion;
-                (void) fprintf( amoebaGpu->log, "%6d [%6u %6u] %8u %8u indexInToIndices=%8d\n", ii, x, y, exclusion, pWorkList[ii],
+                decodeCell( pWorkList[ii], &x, &y, &exclusion );
-                                psScalingIndicesIndex->_pSysData[ii] );
+                if( exclusion ){
+                    (void) fprintf( amoebaGpu->log, "%6d [%6u %6u] %8u %8u indexInToIndices=%8d\n", ii, x, y, exclusion, pWorkList[ii],
+                                    psScalingIndicesIndex->_pSysData[ii] );
+                }
            }
        }
        (void) fflush( amoebaGpu->log );
@@ -4471,3 +4472,32 @@ void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream
    outputStream->Upload();
 }
+/**----------------------------------------------------------------------------------------
+   Reduce and copy  CUDAStream<float> stream to  CUDAStream<float> with reduction
+   @param streamToCopy     float4 stream to copy
+   @param outputStream     output stream
+   @param conversion       conversion factor
+   --------------------------------------------------------------------------------------- */
+void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>*  outputStream, float conversion )
+{
+    streamToCopy->Download();
+    for( unsigned int ii = 0; ii < streamToCopy->_stride; ii++ ){
+        outputStream->_pSysData[ii]    = streamToCopy->_pSysStream[0][ii];
+if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %15.7e %u %u\n", ii, streamToCopy->_pSysStream[0][ii], streamToCopy->_stride, streamToCopy->_subStreams );
+        for( int jj = 1; jj < streamToCopy->_subStreams; jj++ ){
+            if(  streamToCopy->_pSysStream[jj][ii] !=  streamToCopy->_pSysStream[jj][ii] ){
+                (void) fprintf( stderr, "Nan at particle=%d stream=%d\n", ii, jj );
+            }
+            outputStream->_pSysData[ii]   += streamToCopy->_pSysStream[jj][ii];
+if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %d %15.7e %15.7e\n", ii, jj, streamToCopy->_pSysStream[jj][ii], outputStream->_pSysData[ii] );
+        }
+        outputStream->_pSysData[ii]    *= conversion;
+    }
+    outputStream->Upload();
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -166,6 +166,7 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
 extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
 extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
 extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>*  outputStream, float conversion );
+extern void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>*  outputStream, float conversion );
 // PME

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -140,6 +140,7 @@ struct cudaAmoebaGmxSimulation {
    int4*  pMultipoleParticlesIdsAndAxisType; 
    int4*  pMultipoleParticlesTorqueBufferIndices; 
+    int maxTorqueBufferIndex;
    float4* pTorqueMapForce4;
    float* pMolecularDipole; 

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -105,7 +105,6 @@ struct _amoebaGpuContext {
    // buffer indices used for mapping torques onto forces 
-    int maxTorqueBufferIndex;
    int torqueMapForce4Delete;
    CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
    CUDAStream<float4>*  psTorqueMapForce4; 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -35,62 +35,7 @@ static int const PScaleIndex            =  0;
 static int const DScaleIndex            =  1; 
 static int const UScaleIndex            =  2; 
 static int const MScaleIndex            =  3;
-static int const Scale3Index            =  4;
+static int const LastScalingIndex       =  4;
-static int const Scale5Index            =  5;
-static int const Scale7Index            =  6;
-static int const Scale9Index            =  7;
-static int const Ddsc30Index            =  8;
-//static int const Ddsc31Index            =  9;
-//static int const Ddsc32Index            = 10; 
-static int const Ddsc50Index            = 11;
-//static int const Ddsc51Index            = 12;
-//static int const Ddsc52Index            = 13; 
-static int const Ddsc70Index            = 14;
-//static int const Ddsc71Index            = 15;
-//static int const Ddsc72Index            = 16;
-static int const LastScalingIndex       = 17;
-#define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
-#define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
-  u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
-  u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
-#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
-#define i35 0.257142857f
-#define one 1.0f
-__device__ void acrossProductVector3(   float* vectorX, float* vectorY, float* vectorZ ){
-    vectorZ[0]  = vectorX[1]*vectorY[2] - vectorX[2]*vectorY[1];
-    vectorZ[1]  = vectorX[2]*vectorY[0] - vectorX[0]*vectorY[2];
-    vectorZ[2]  = vectorX[0]*vectorY[1] - vectorX[1]*vectorY[0];
-}
-__device__ void amatrixProductVector3(   float* matrixX, float* vectorY, float* vectorZ ){
-    vectorZ[0]  = matrixX[0]*vectorY[0] + matrixX[3]*vectorY[1] + matrixX[6]*vectorY[2];
-    vectorZ[1]  = matrixX[1]*vectorY[0] + matrixX[4]*vectorY[1] + matrixX[7]*vectorY[2];
-    vectorZ[2]  = matrixX[2]*vectorY[0] + matrixX[5]*vectorY[1] + matrixX[8]*vectorY[2];
-}
-__device__ void amatrixCrossProductMatrix3( float* matrixX, float* matrixY, float* vectorZ ){
-    float* xPtr[3];
-    float* yPtr[3];
-    xPtr[0]    = matrixX;
-    xPtr[1]    = matrixX + 3;
-    xPtr[2]    = matrixX + 6;
-    yPtr[0]    = matrixY;
-    yPtr[1]    = matrixY + 3;
-    yPtr[2]    = matrixY + 6;
-    vectorZ[0] = DOT31( xPtr[1], yPtr[2] ) - DOT31( xPtr[2], yPtr[1] );
-    vectorZ[1] = DOT31( xPtr[2], yPtr[0] ) - DOT31( xPtr[0], yPtr[2] );
-    vectorZ[2] = DOT31( xPtr[0], yPtr[1] ) - DOT31( xPtr[1], yPtr[0] );
-}
 struct ElectrostaticParticle {
@@ -124,17 +69,26 @@ struct ElectrostaticParticle {
    float force[3];
-    float torque[3];
+    //float torque[3];
-    float padding;
+    //float padding;
 };
-__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
+#ifdef Original
-                                                      float* scalingFactors, float4*  outputForce, float4  outputTorque[2]
-#ifdef AMOEBA_DEBUG
+#define i35 0.257142857f
-                                                      ,float4* debugArray 
+#define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
-#endif
- ){
+#define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
+  u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
+  u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
+#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
+#define one 1.0f
+__device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
+                                                      float* scalingFactors, float4*  outputForce, float4  outputTorque[2]){
    float deltaR[3];
@@ -293,37 +247,6 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato
    float ei                 = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
    outputForce->w           = em+ei;
-#ifdef AMOEBA_DEBUG
-#if 0
-if( 1 ){
-    int debugIndex           = 0;
-    debugArray[debugIndex].x = em;
-    debugArray[debugIndex].y = ei;
-    debugArray[debugIndex].z = rr1;
-    debugArray[debugIndex].w = rr3;
-    debugIndex++;
-    debugArray[debugIndex].x = gl0;
-    debugArray[debugIndex].y = gl1;
-    debugArray[debugIndex].z = gl6;
-    debugArray[debugIndex].w = gl2;
-    debugIndex++;
-    debugArray[debugIndex].x = gli1;
-    debugArray[debugIndex].y = gli3;
-    debugArray[debugIndex].z = gli2;
-    debugArray[debugIndex].w = gli7;
-    debugIndex++;
-    debugArray[debugIndex].x = psc0;
-    debugArray[debugIndex].y = psc1;
-    debugArray[debugIndex].z = psc2;
-    debugArray[debugIndex].w = scalingFactors[MScaleIndex];
-}
-#endif
-#endif
    float temp1[3],temp2[3],temp3[3];
    float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
    amatrixProductVector3( atomI.labFrameQuadrupole,      atomJ.labFrameDipole,     qIdJ );//MK
@@ -528,99 +451,6 @@ if( 1 ){
    }
-#ifdef AMOEBA_DEBUG
-if( 0 ){
-int debugIndex               = 0;
-    debugArray[debugIndex].x = scalingFactors[DScaleIndex];
-    debugArray[debugIndex].y = scalingFactors[PScaleIndex];
-    debugArray[debugIndex].z = scalingFactors[MScaleIndex];
-    debugArray[debugIndex].w = scalingFactors[UScaleIndex];
-    debugIndex++;
-    debugArray[debugIndex].x = ftm2i_0 + (fridmp_0 + findmp_0);
-    debugArray[debugIndex].y = ftm2i_1 + (fridmp_1 + findmp_1);
-    debugArray[debugIndex].z = ftm2i_2 + (fridmp_2 + findmp_2);
-    debugArray[debugIndex].w = 1.5;
-/*
-    debugIndex++;
-    debugArray[debugIndex].x = temp2[0];
-    debugArray[debugIndex].y = temp2[1];
-    debugArray[debugIndex].z = temp2[2];
-    debugArray[debugIndex].w = 2.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp3[0];
-    debugArray[debugIndex].y = temp3[1];
-    debugArray[debugIndex].z = temp3[2];
-    debugArray[debugIndex].w = 3.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp4[0];
-    debugArray[debugIndex].y = temp4[1];
-    debugArray[debugIndex].z = temp4[2];
-    debugArray[debugIndex].w = 4.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp5[0];
-    debugArray[debugIndex].y = temp5[1];
-    debugArray[debugIndex].z = temp5[2];
-    debugArray[debugIndex].w = 5.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp6[0];
-    debugArray[debugIndex].y = temp6[1];
-    debugArray[debugIndex].z = temp6[2];
-    debugArray[debugIndex].w = 6.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp14[0];
-    debugArray[debugIndex].y = temp14[1];
-    debugArray[debugIndex].z = temp14[2];
-    debugArray[debugIndex].w = 14.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp7[0];
-    debugArray[debugIndex].y = temp7[1];
-    debugArray[debugIndex].z = temp7[2];
-    debugArray[debugIndex].w = 7.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = temp8[0];
-    debugArray[debugIndex].y = temp8[1];
-    debugArray[debugIndex].z = temp8[2];
-    debugArray[debugIndex].w = 8.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = rr3;
-    debugArray[debugIndex].y = gf3;
-    debugArray[debugIndex].z = gf6;
-    debugArray[debugIndex].w = 20.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = gf4;
-    debugArray[debugIndex].y = gf7;
-    debugArray[debugIndex].z = 0.0f;
-    debugArray[debugIndex].w = 21.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
-    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
-    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
-    debugArray[debugIndex].w = 22.0f;
-    debugIndex++;
-    debugArray[debugIndex].x = deltaR[0];
-    debugArray[debugIndex].y = deltaR[1];
-    debugArray[debugIndex].z = deltaR[2];
-    debugArray[debugIndex].w = 23.0f;
-*/
-}
-#endif
    outputForce->x       = -(ftm2_0 + ftm2i_0);
    outputForce->y       = -(ftm2_1 + ftm2i_1);
    outputForce->z       = -(ftm2_2 + ftm2i_2);
@@ -636,50 +466,124 @@ int debugIndex               = 0;
    return;
 }
+#endif
+static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle* sA, unsigned int atomI ){
-__device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsigned int atomI,
-                                         float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
-                                         float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
-{
    // coordinates & charge
-    sA->x                        = atomCoord[atomI].x;
+    sA->x                        = cSim.pPosq[atomI].x;
-    sA->y                        = atomCoord[atomI].y;
+    sA->y                        = cSim.pPosq[atomI].y;
-    sA->z                        = atomCoord[atomI].z;
+    sA->z                        = cSim.pPosq[atomI].z;
-    sA->q                        = atomCoord[atomI].w;
+    sA->q                        = cSim.pPosq[atomI].w;
    // lab dipole
-    sA->labFrameDipole[0]         = labFrameDipoleJ[atomI*3];
+    sA->labFrameDipole[0]        = cAmoebaSim.pLabFrameDipole[atomI*3];
-    sA->labFrameDipole[1]         = labFrameDipoleJ[atomI*3+1];
+    sA->labFrameDipole[1]        = cAmoebaSim.pLabFrameDipole[atomI*3+1];
-    sA->labFrameDipole[2]         = labFrameDipoleJ[atomI*3+2];
+    sA->labFrameDipole[2]        = cAmoebaSim.pLabFrameDipole[atomI*3+2];
    // lab quadrupole
-    sA->labFrameQuadrupole[0]    = labQuadrupole[atomI*9];
+    sA->labFrameQuadrupole[0]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
-    sA->labFrameQuadrupole[1]    = labQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole[1]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
-    sA->labFrameQuadrupole[2]    = labQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole[2]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
-    sA->labFrameQuadrupole[3]    = labQuadrupole[atomI*9+3];
+    sA->labFrameQuadrupole[3]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
-    sA->labFrameQuadrupole[4]    = labQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole[4]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
-    sA->labFrameQuadrupole[5]    = labQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole[5]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
-    sA->labFrameQuadrupole[6]    = labQuadrupole[atomI*9+6];
+    sA->labFrameQuadrupole[6]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
-    sA->labFrameQuadrupole[7]    = labQuadrupole[atomI*9+7];
+    sA->labFrameQuadrupole[7]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
-    sA->labFrameQuadrupole[8]    = labQuadrupole[atomI*9+8];
+    sA->labFrameQuadrupole[8]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
    // induced dipole
-    sA->inducedDipole[0]          = inducedDipole[atomI*3];
+    sA->inducedDipole[0]         = cAmoebaSim.pInducedDipole[atomI*3];
-    sA->inducedDipole[1]          = inducedDipole[atomI*3+1];
+    sA->inducedDipole[1]         = cAmoebaSim.pInducedDipole[atomI*3+1];
-    sA->inducedDipole[2]          = inducedDipole[atomI*3+2];
+    sA->inducedDipole[2]         = cAmoebaSim.pInducedDipole[atomI*3+2];
    // induced dipole polar
-    sA->inducedDipoleP[0]         = inducedDipolePolar[atomI*3];
+    sA->inducedDipoleP[0]        = cAmoebaSim.pInducedDipolePolar[atomI*3];
-    sA->inducedDipoleP[1]         = inducedDipolePolar[atomI*3+1];
+    sA->inducedDipoleP[1]        = cAmoebaSim.pInducedDipolePolar[atomI*3+1];
-    sA->inducedDipoleP[2]         = inducedDipolePolar[atomI*3+2];
+    sA->inducedDipoleP[2]        = cAmoebaSim.pInducedDipolePolar[atomI*3+2];
+    sA->damp                     = cAmoebaSim.pDampingFactorAndThole[atomI].x;
+    sA->thole                    = cAmoebaSim.pDampingFactorAndThole[atomI].y;
+}
+static __device__ void zeroElectrostaticParticle( struct ElectrostaticParticle* sA ){
-    sA->damp                     = dampingFactorAndThole[atomI].x;
+    // coordinates & charge
-    sA->thole                    = dampingFactorAndThole[atomI].y;
+    sA->force[0]                 = 0.0f;
+    sA->force[1]                 = 0.0f;
+    sA->force[2]                 = 0.0f;
+/*
+    sA->torque[0]                = 0.0f;
+    sA->torque[1]                = 0.0f;
+    sA->torque[2]                = 0.0f;
+*/
+}
+#undef SUB_METHOD_NAME
+#undef F1
+#define SUB_METHOD_NAME(a, b) a##F1##b
+#define F1
+#include "kCalculateAmoebaCudaElectrostatic_b.h"
+#undef F1
+#undef SUB_METHOD_NAME
+#undef SUB_METHOD_NAME
+#undef F2
+#define SUB_METHOD_NAME(a, b) a##F2##b
+#define F2
+//#include "kCalculateAmoebaCudaElectrostatic_b.h"
+#undef F2
+#undef SUB_METHOD_NAME
+#undef SUB_METHOD_NAME
+#undef T1
+#define SUB_METHOD_NAME(a, b) a##T1##b
+#define T1
+#include "kCalculateAmoebaCudaElectrostatic_b.h"
+#undef T1
+#undef SUB_METHOD_NAME
+#undef SUB_METHOD_NAME
+#undef T3
+#define SUB_METHOD_NAME(a, b) a##T3##b
+#define T3
+#include "kCalculateAmoebaCudaElectrostatic_b.h"
+#undef T3
+#undef SUB_METHOD_NAME
+__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
+                                                      float* scalingFactors, float4*  outputForce, float4 outputTorque[2], float forceFactor){
+#ifdef Orig
+    return calculateElectrostaticPairIxn_kernel( atomI, atomJ, scalingFactors, outputForce, outputTorque);
+#else
+    float force[3];
+    float energy;
+    calculateElectrostaticPairIxnF1_kernel( atomI,  atomJ, scalingFactors, &energy, force);
+    outputForce->x = force[0];
+    outputForce->y = force[1];
+    outputForce->z = force[2];
+    outputForce->w = energy;
+    calculateElectrostaticPairIxnT1_kernel( atomI,  atomJ, scalingFactors, force);
+    outputTorque[0].x = force[0];
+    outputTorque[0].y = force[1];
+    outputTorque[0].z = force[2];
+    calculateElectrostaticPairIxnT3_kernel( atomI,  atomJ, scalingFactors, force);
+    outputTorque[1].x = force[0];
+    outputTorque[1].y = force[1];
+    outputTorque[1].z = force[2];
+    return;
+#endif
 }
@@ -754,7 +658,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
-            maxThreads = 384;
+            //maxThreads = 384;
+            maxThreads = 512;
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128;
        else
@@ -773,53 +678,39 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
 #endif
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
-                                                                           gpu->psPosq4->_pDevData,
-                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
-                                                                           amoebaGpu->psInducedDipole->_pDevData,
-                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
    } else {
        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
-                                                                           gpu->psPosq4->_pDevData,
-                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
-                                                                           amoebaGpu->psInducedDipole->_pDevData,
-                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
    }
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
+    if( 0 ){ 
+        VectorOfDoubleVectors outputVector;
+        std::vector<int> fileId;
+        static int call = 0; 
+        fileId.push_back( call++ );
+        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
+        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
+        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        reduceAndCopyCUDAStreamFloat( amoebaGpu->psWorkArray_3_1, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostaticTorque", fileId, outputVector );
+        delete temp;
+    }    
    if( addTorqueToForce ){
        kReduceTorque( amoebaGpu );
        cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
    }
-    if( 0 ){
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-        //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f/4.184 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f/4.184 );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
-     }
   // ---------------------------------------------------------------------------------------
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic_b.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic_b.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -85,13 +85,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
    // N2 debug array
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
+    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(10*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
-    (*gpu->psInteractionCount)[0]              = gpu->sim.workUnits;
-    gpu->psInteractionCount->Upload();
    // print intermediate results for the targetAtom 
    unsigned int targetAtom  = 3;
@@ -201,6 +198,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
        (void) fflush( amoebaGpu->log );
        (void) fprintf( amoebaGpu->log, "EFields End\n" );
+/*
        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
        debugArray->Download();
        if( 0 ){
@@ -256,23 +254,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
                   sum[1][1] += debugArray->_pSysData[debugIndex].y;
                   sum[1][2] += debugArray->_pSysData[debugIndex].z;
-/*
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-                   debugIndex += gpu->natoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
- */   
               }
               (void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
                               sum[0][0], sum[0][1], sum[0][2],
@@ -301,10 +282,11 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
                                debugArray->_pSysData[ii].w );
            }
        }
+*/
        // write results to file
-        if( 1 ){
+        if( 0 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
@@ -314,7 +296,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
         }
-         delete debugArray;
+         //delete debugArray;
    }
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiffParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiffParticle.h
+#ifndef AMOEBA_CUDA_KIRKWOOD_PARTICLE_H
+#define AMOEBA_CUDA_KIRKWOOD_PARTICLE_H
+struct KirkwoodEDiffParticle {
+    // coordinates charge
+    float x;
+    float y;
+    float z;
+    float q;
+    // scaling factor
+    float thole;
+    float damp;
+    // lab frame dipole
+    float labFrameDipole[3]; 
+    // lab frame quadrupole
+    float labFrameQuadrupole_XX;
+    float labFrameQuadrupole_XY;
+    float labFrameQuadrupole_XZ;
+    float labFrameQuadrupole_YY;
+    float labFrameQuadrupole_YZ;
+    float labFrameQuadrupole_ZZ;
+    // induced dipole and polar counterpart
+    float inducedDipole[3]; 
+    float inducedDipoleP[3]; 
+    // solvent induced dipole and polar counterpart
+    float inducedDipoleS[3]; 
+    float inducedDipolePS[3]; 
+    // Born radii
+    float force[3];
+//    float torque[3];
+};
+#endif
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff_b.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff_b.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodParticle.h
@@ -36,55 +36,13 @@ struct KirkwoodParticle {
    float bornRadius;
    float force[3];
+#ifdef INCLUDE_TORQUE
    float torque[3];
+#endif
    float dBornRadius;
    float dBornRadiusPolar;
-    float padding;
+//    float padding;
-};
-struct KirkwoodEDiffParticle {
-    // coordinates charge
-    float x;
-    float y;
-    float z;
-    float q;
-    // scaling factor
-    float thole;
-    float damp;
-    // lab frame dipole
-    float labFrameDipole[3]; 
-    // lab frame quadrupole
-    float labFrameQuadrupole_XX;
-    float labFrameQuadrupole_XY;
-    float labFrameQuadrupole_XZ;
-    float labFrameQuadrupole_YY;
-    float labFrameQuadrupole_YZ;
-    float labFrameQuadrupole_ZZ;
-    // induced dipole and polar counterpart
-    float inducedDipole[3]; 
-    float inducedDipoleP[3]; 
-    // solvent induced dipole and polar counterpart
-    float inducedDipoleS[3]; 
-    float inducedDipolePS[3]; 
-    // Born radii
-    float force[3];
-    float torque[3];
 };

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood_b.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood_b.h
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
@@ -67,6 +67,80 @@ __device__ static void loadMappedTorque( int particleId, int bufferIndex, float*
 }
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void amoebaAddMapTorqueForceToForce_kernel( void )
+{
+  // ---------------------------------------------------------------------------------------
+    int pos                      = blockIdx.x*blockDim.x + threadIdx.x;
+  // ---------------------------------------------------------------------------------------
+    while (pos < cSim.stride4 )
+    {   
+        float totalForce = 0.0f;
+        float* pFt       = (float*)cAmoebaSim.pTorqueMapForce4 + pos;
+        int i            = cAmoebaSim.maxTorqueBufferIndex;
+        while (i >= 4)
+        {   
+            float f1    = *pFt;
+            pFt        += cSim.stride4;
+            float f2    = *pFt;
+            pFt        += cSim.stride4;
+            float f3    = *pFt;
+            pFt        += cSim.stride4;
+            float f4    = *pFt;
+            pFt        += cSim.stride4;
+            totalForce += f1 + f2 + f3 + f4; 
+            i -= 4;
+        }   
+        if (i >= 2)
+        {   
+            float f1    = *pFt;
+            pFt        += cSim.stride4;
+            float f2    = *pFt;
+            pFt        += cSim.stride4;
+            totalForce += f1 + f2; 
+            i -= 2;
+        }   
+        if (i > 0)
+        {   
+            totalForce += *pFt;
+        }   
+        pFt             = (float*)cSim.pForce4 + pos;
+        *pFt           += totalForce;
+        pos            += gridDim.x * blockDim.x;
+    }
+}
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void amoebaClearMapTorqueForce_kernel( void )
+{
+    int pos                      = blockIdx.x*blockDim.x + threadIdx.x;
+    while (pos < cSim.stride4*cAmoebaSim.maxTorqueBufferIndex )
+    {   
+        cAmoebaSim.pTorqueMapForce4[pos]  = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+        pos                              += gridDim.x * blockDim.x;
+    }
+}
 __global__
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
@@ -359,7 +433,77 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
 {
    gpuContext gpu    = amoebaGpu->gpuContext;
+    if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > amoebaGpu->gpuContext->sim.outputBuffers && amoebaGpu->psTorqueMapForce4 != amoebaGpu->gpuContext->psForce4 && amoebaGpu->psTorqueMapForce4 ){
+        amoebaClearMapTorqueForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( );
+        LAUNCHERROR("amoebaClearMapTorqueForce");
+    }
+    if( 0 ){  
+        VectorOfDoubleVectors outputVector;
+        std::vector<int> fileId;
+        static int call = 0;  
+        fileId.push_back( call++ );
+        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
+        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        reduceAndCopyCUDAStreamFloat( psTorque, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        reduceAndCopyCUDAStreamFloat4( amoebaGpu->psTorqueMapForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostatiPreTorqueForce", fileId, outputVector );
+        delete temp;
+    }    
    amoebaMapTorqueToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( psTorque->_pDevData );
    LAUNCHERROR("amoebaMapTorqueToForce");
+    if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > amoebaGpu->gpuContext->sim.outputBuffers && amoebaGpu->psTorqueMapForce4 != amoebaGpu->gpuContext->psForce4 && amoebaGpu->psTorqueMapForce4 ){
+        amoebaAddMapTorqueForceToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( );
+        LAUNCHERROR("amoebaAddMapTorqueForceToForce");
+    }
+#ifdef AMOEBA_DEBUG
+    if( 0 ){  
+        VectorOfDoubleVectors outputVector;
+        std::vector<int> fileId;
+        static int call = 0;  
+        fileId.push_back( call++ );
+        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
+        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        reduceAndCopyCUDAStreamFloat4( amoebaGpu->psTorqueMapForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
+        for( int pId = 0; pId < 5; pId++ ){
+        float sum[3] = { 0.0f,  0.0f,  0.0f };
+        (void) fprintf( stderr, "\n\nTorqueForceToForce for part=%d\n", pId );
+        for( int ii = 0; ii < amoebaGpu->amoebaSim.maxTorqueBufferIndex; ii++ ){
+            (void) fprintf( stderr, "%4d [%15.7e %15.7e %15.7e]\n", ii,
+                            amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x,
+                            amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y,
+                            amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z );
+            sum[0] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x;
+            sum[1] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y;
+            sum[2] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z;
+        }
+        (void) fprintf( stderr, "TorqueForceToForce for partcle=%d [%15.7e %15.7e %15.7e]  [%15.7e %15.7e %15.7e]\n", pId, sum[0], sum[1], sum[2], sum[0]/4.184f, sum[1]/4.184f, sum[2]/4.184f );
+        }
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostatiPostTorqueForce", fileId, outputVector );
+        delete temp;
+    }
+#endif
 }