Torques now mapped to forces only once per force calculation

1aff5bcb · Mark Friedrichs · 101f206d · 1aff5bcb · 1aff5bcb · 1aff5bcb
Commit 1aff5bcb authored Apr 07, 2011 by Mark Friedrichs
19 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -793,17 +793,12 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    if( data.getHasAmoebaGeneralizedKirkwood() ){
        kCalculateObcGbsaBornSum(gpu->gpuContext);
        kReduceObcGbsaBornSum(gpu->gpuContext);
-        //initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psBornRadii, 0.1 );
-        //initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psObcChain,  0.0 );
    }   

    // multipoles

    kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );

-//kClearForces(gpu->gpuContext);
-//kClearEnergy(gpu->gpuContext);
-
    // GK

    if( data.getHasAmoebaGeneralizedKirkwood() ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -104,7 +104,7 @@ extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::

 extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu );
+extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce );

 extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -705,7 +705,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 0 );
    LAUNCHERROR("kReduceElectrostaticTorque");
 }

@@ -718,7 +718,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )

   --------------------------------------------------------------------------------------- */

-void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
+void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce )
 {
  
   // ---------------------------------------------------------------------------------------
@@ -768,7 +768,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
    }

    kClearFields_3( amoebaGpu, 1 );
-    LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");

 #ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
@@ -812,117 +811,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
    }
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

-    kReduceTorque( amoebaGpu );
-    LAUNCHERROR("kReduceForceTorque");
-
-    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        amoebaGpu->psTorque->Download();
-        debugArray->Download();
-
-        (void) fprintf( amoebaGpu->log, "Finished Electrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
-
-        int maxPrint        = 1400;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-    
-           // torque
-
-           (void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psTorque->_pSysData[indexOffset],
-                           amoebaGpu->psTorque->_pSysData[indexOffset+1],
-                           amoebaGpu->psTorque->_pSysData[indexOffset+2] );
-
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        if( 1 ){
-            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
-            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-            for( int jj = 0; jj < gpu->natoms; jj++ ){
-                int debugIndex = jj;
-                for( int kk = 0; kk < 8; kk++ ){
-                    float conversion = kk >= 1 && kk <= 8 ? 1.0f/4.184f : 1.0;
-                    (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
-                                    conversion*debugArray->_pSysData[debugIndex].x, conversion*debugArray->_pSysData[debugIndex].y,
-                                    conversion*debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                    debugIndex += paddedNumberOfAtoms;
-                }
-                (void) fprintf( amoebaGpu->log,"\n" );
-            }
-        }
-        if( 1 ){
-            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
-            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-            for( int jj = 0; jj < gpu->natoms; jj++ ){
-                int debugIndex1 = jj + paddedNumberOfAtoms;
-                int debugIndex2 = jj + 5*paddedNumberOfAtoms;
-                int debugIndex3 = jj + 6*paddedNumberOfAtoms;
-                int debugIndex4 = jj + 4*paddedNumberOfAtoms;
-                int debugIndex5 = jj + 7*paddedNumberOfAtoms;
-                float conversion = 1.0f/4.184f;
-                int i1,i2;
-                if( jj < targetAtom ){
-                    i1 = jj;
-                    i2 = targetAtom;
-                } else {
-                    i1 = targetAtom;
-                    i2 = jj;
-                }
-                (void) fprintf( amoebaGpu->log,"%5d %5d %16.9e %16.9e %16.9e    %16.9e %16.9e %16.9e   %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e F11\n", i1,i2,
-                                conversion*debugArray->_pSysData[debugIndex1].x,
-                                conversion*debugArray->_pSysData[debugIndex1].y,
-                                conversion*debugArray->_pSysData[debugIndex1].z, 
-                                conversion*debugArray->_pSysData[debugIndex2].x,
-                                conversion*debugArray->_pSysData[debugIndex2].y,
-                                conversion*debugArray->_pSysData[debugIndex2].z, 
-                                conversion*debugArray->_pSysData[debugIndex3].x,
-                                conversion*debugArray->_pSysData[debugIndex3].y,
-                                conversion*debugArray->_pSysData[debugIndex3].z,
-                                conversion*debugArray->_pSysData[debugIndex5].x,
-                                conversion*debugArray->_pSysData[debugIndex5].y,
-                                conversion*debugArray->_pSysData[debugIndex5].z );
-            }
-        }
-        (void) fflush( amoebaGpu->log );
-
-        if( 0 ){
-            (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
-            int maxPrint = 12;
-            for( int ii = 0; ii < gpu->natoms; ii++ ){
-    
-                // print cpu & gpu reductions
-    
-                int offset  = 3*ii;
-    
-                (void) fprintf( amoebaGpu->log,"%6d T[%16.7e %16.7e %16.7e]\n", ii,
-                                amoebaGpu->psTorque->_pSysData[offset],
-                                amoebaGpu->psTorque->_pSysData[offset+1],
-                                amoebaGpu->psTorque->_pSysData[offset+2] );
-                if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
-            }   
-        }   
-
-        if( 1 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaTorque", fileId, outputVector );
-         }
-
-    }   
-    delete debugArray;
-
-#endif
+    if( addTorqueToForce ){
+        kReduceTorque( amoebaGpu );
+        cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
+    }

    if( 0 ){
        std::vector<int> fileId;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
@@ -41,17 +41,17 @@ static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );

    LAUNCHERROR("kReduceEAndGK_Fields1");
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
    LAUNCHERROR("kReduceEAndGK_Fields2");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
+                               amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData, 0 );
    LAUNCHERROR("kReduceEAndGK_Fields3");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -39,12 +39,12 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );
    LAUNCHERROR("kReduceE_Fields1");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
    LAUNCHERROR("kReduceE_Fields2");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1531,19 +1531,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
 #define METHOD_NAME(a, b) a##N2ByWarp##b
 #include "kCalculateAmoebaCudaKirkwood.h"

-// reduce psWorkArray_3_1 -> force
-// reduce psWorkArray_3_2 -> torque
-
-static void kReduceTorque(amoebaGpuContext amoebaGpu )
-{
-
-    gpuContext gpu = amoebaGpu->gpuContext;
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                           gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
-    LAUNCHERROR("kReduceTorque");
-}
-
 // reduce psWorkArray_1_1 -> dBorn
 // reduce psWorkArray_1_2 -> dBornPolar

@@ -1555,13 +1542,13 @@ static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
 /*
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                           gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData );
+                           amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
    LAUNCHERROR("kReduce_dBorn1");
 */

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                           gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData );
+                           amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData, 0 );

    LAUNCHERROR("kReduce_dBorn2");
 }
@@ -1841,7 +1828,6 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
    }

    kClearFields_1( amoebaGpu );
-    kClearFields_3( amoebaGpu, 6 );

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
@@ -1871,78 +1857,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
                                                                           );
 #endif
    }
-
    LAUNCHERROR("kCalculateAmoebaCudaKirkwoodN2Forces");

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_1_1->Download();
-        amoebaGpu->psWorkArray_1_2->Download();
-/*
-        amoebaGpu->psLabFrameDipole->Download();
-        amoebaGpu->psLabFrameQuadrupole->Download();
-        amoebaGpu->psInducedDipoleS->Download();
-        amoebaGpu->psInducedDipolePolarS->Download();
-
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           int indexOffset3    = ii*3;
-           int indexOffset9    = ii*9;
-           (void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
-                           amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3],
-                           amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+1],
-                           amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+2],
-                           amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9],
-                           amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+1],
-                           amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+2] );
-           (void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
-                           amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3],
-                           amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+1],
-                           amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+2],
-                           amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3],
-                           amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+1],
-                           amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+2] );
-        }
-*/
-        debugArray->Download();
-
-        (void) fprintf( amoebaGpu->log, "Target Info\n" );
-        (void) fflush( amoebaGpu->log );
-
-        int paddedNumberOfAtoms          = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            int debugIndex = jj;
-            (void) fprintf( amoebaGpu->log,"%5d %5d DebugGk\n", targetAtom, jj );
-            for( int kk = 0; kk < 8; kk++ ){
-                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                debugIndex += paddedNumberOfAtoms;
-            }
-            (void) fprintf( amoebaGpu->log,"\n" );
-       }
-    }
-#endif
-
-    kReduceTorque( amoebaGpu );
-
-    if( 0 ){
-        std::vector<int> fileId;
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, amoebaGpu->psWorkArray_3_1, 1.0 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psWorkArray_3_1,      outputVector, NULL, 1.0f/4.184 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,             outputVector, NULL, 1.0f/4.184 );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForceTorque", fileId, outputVector );
-    }   
-
    kReduceToBornForcePrefactor( amoebaGpu );

-    // map torques to forces
-
-    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
-
    if( 0 ){
        std::vector<int> fileId;
        VectorOfDoubleVectors outputVector;
@@ -1969,15 +1887,5 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )

    kCalculateAmoebaKirkwoodEDiff( amoebaGpu );

-    if( 0 ){
-        std::vector<int> fileId;
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, amoebaGpu->psWorkArray_3_1, 1.0 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psWorkArray_3_1,      outputVector, NULL, 1.0f/4.184 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,             outputVector, NULL, 1.0f/4.184 );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForceTorquePostEDiff", fileId, outputVector );
-    }   
-
   // ---------------------------------------------------------------------------------------
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
@@ -54,8 +54,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;

-    // pWorkArray_3_1 == force
-    // pWorkArray_3_2 == torque
+    // pWorkArray_3_1 == torque

    // pWorkArray_1_1 == dBorn
    // pWorkArray_1_2 == dBornPolar
@@ -220,7 +219,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            add3dArrayToFloat4(   offset, localParticle.force,  cSim.pForce4 );
-            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
+            add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );

 #else
            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
@@ -229,7 +228,7 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

            add3dArrayToFloat4(    offset, localParticle.force,  cSim.pForce4);
-            load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
+            add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
 #endif

        }
@@ -377,7 +376,7 @@ if( mask || !mask ){
            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
-            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
+            add3dArray(       3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );

            offset                              = y + tgx + warp*cSim.paddedNumberOfAtoms;

@@ -390,7 +389,7 @@ if( mask || !mask ){
            cAmoebaSim.pWorkArray_1_2[offset]   = of;

            add3dArrayToFloat4(   offset, sA[threadIdx.x].force,  cSim.pForce4 );
-            load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
+            add3dArray(         3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
 #else
            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;

@@ -398,7 +397,7 @@ if( mask || !mask ){
            cAmoebaSim.pWorkArray_1_2[offset]   = dBornPolarSum;

            add3dArrayToFloat4(   offset, localParticle.force,  cSim.pForce4 );
-            load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
+            add3dArray(         3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );

            offset                              = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;

@@ -406,7 +405,7 @@ if( mask || !mask ){
            cAmoebaSim.pWorkArray_1_2[offset]   = sA[threadIdx.x].dBornRadiusPolar;

            add3dArrayToFloat4( offset,    sA[threadIdx.x].force,  cSim.pForce4 );
-            load3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
+            add3dArray(       3*offset,    sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );

 #endif
            lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -936,7 +936,7 @@ static void kReduceTorque( amoebaGpuContext amoebaGpu )
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                           gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
+                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 0 );

    LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff");
 }
@@ -985,9 +985,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
    unsigned int targetAtom           = 0;
 #endif

-    kClearFields_3( amoebaGpu, 6 );
-    LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff");
-
    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
@@ -1047,14 +1044,11 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
    }
-
    LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");

-    kReduceTorque( amoebaGpu );
-    LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff");
-
-    // map torques to forces
+    // reduce and map torques to forces

+    kReduceTorque( amoebaGpu );
    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );



--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
@@ -251,12 +251,12 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            unsigned int offset                 = x + tgx + warp*cSim.paddedNumberOfAtoms;

            add3dArrayToFloat4(         offset, localParticle.force,  cSim.pForce4 );
-            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
+            add3dArray(               3*offset, localParticle.torque, outputTorque );

 #else
            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
-            load3dArray(      3*offset, localParticle.torque, outputTorque );
+            add3dArray(       3*offset, localParticle.torque, outputTorque );

 #endif

@@ -437,23 +437,22 @@ if( atomI == targetAtom  || atomJ == targetAtom ){
            unsigned int offset                 = x + tgx + warp*cSim.paddedNumberOfAtoms;

            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
-            load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
+            add3dArray(       3*offset, localParticle.torque, outputTorque );

            offset                              = y + tgx + warp*cSim.paddedNumberOfAtoms;

-            add3dArrayToFloat4(         offset, sA[threadIdx.x].force,  cSim.pForce4 );
-            load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, outputTorque );
+            add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
+            add3dArray(       3*offset, sA[threadIdx.x].torque, outputTorque );
 #else
            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;

            add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
-            load3dArray(      3*offset, localParticle.torque, outputTorque );
-
+            add3dArray(       3*offset, localParticle.torque, outputTorque );

            offset                              = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;

            add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
-            load3dArray(      3*offset, sA[threadIdx.x].torque, outputTorque );
+            add3dArray(       3*offset, sA[threadIdx.x].torque, outputTorque );

 #endif
            lasty = y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -418,22 +418,22 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
    LAUNCHERROR("kReduceMutualInducedAndGkFields1");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
    LAUNCHERROR("kReduceMutualInducedAndGkFields2");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData );
+                               amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData, 0 );
    LAUNCHERROR("kReduceMutualInducedAndGkFields3");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData );
+                               amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData, 0 );
    LAUNCHERROR("kReduceMutualInducedAndGkFields4");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -220,12 +220,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
    LAUNCHERROR("kReduceMI_Fields1");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
    LAUNCHERROR("kReduceMI_Fields2");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
@@ -867,15 +867,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
        multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
        multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
        float* phidp = &cAmoebaSim.pPhidp[20*i];
-        cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+        cAmoebaSim.pTorque[3*i] += 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
                      + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
                      + multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
                      - multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
-        cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+        cAmoebaSim.pTorque[3*i+1] += 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
                      + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
                      + multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
                      - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
-        cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+        cAmoebaSim.pTorque[3*i+2] += 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
                      + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
                      + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
                      - multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
@@ -1016,20 +1016,6 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu)
    kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy");

-    if( 0 ){
-        gpuContext gpu                       = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        kReduceForces( gpu );
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceOnlyFixed", fileId, outputVector );
-        kClearForces( gpu );
-    }   
-
-    cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque);
-
 }

 /**
@@ -1062,5 +1048,4 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
    kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");

-    cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque );
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -1090,7 +1090,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                           gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
+                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 1 );
    LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
 }

@@ -1185,7 +1185,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");

    kReduceTorque( amoebaGpu );
-    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );

 }

@@ -1201,5 +1200,6 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
 {
    cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
    kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
+    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -357,12 +357,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
    LAUNCHERROR("kReducePmeMI_Fields1");

    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
    LAUNCHERROR("kReducePmeMI_Fields2");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -487,7 +487,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    // calculate electrostatic forces

    if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
-        cudaComputeAmoebaElectrostatic( amoebaGpu );
+        cudaComputeAmoebaElectrostatic( amoebaGpu, (hasAmoebaGeneralizedKirkwood ? 0 : 1) );
    } else {
        cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
    }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
@@ -166,7 +166,7 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
 #endif
-void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut )
+void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo )
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

@@ -174,7 +174,7 @@ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuff

    while (pos < fieldComponents)
    {   
-        float totalField = 0.0f;
+        float totalField = addTo ? fieldOut[pos] : 0.0f;
        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.h
@@ -3,7 +3,7 @@

 #include "amoebaCudaKernels.h"

-__global__ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut );
+__global__ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo );
 __global__ void kReduceAndCombineFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2, float* fieldOut );
 __global__ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float4* fieldOut );


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -452,7 +452,7 @@ static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* output
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                           gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
+                           amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
    LAUNCHERROR("kReduceVdw14_7");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
@@ -360,7 +360,7 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
    LAUNCHERROR("kReduceWcaDispersion");
 }