Modifications to reduce the calculation of interactions (ixns) within blocks for Vdw

50a15fb0 · Mark Friedrichs · e6c19b54 · 50a15fb0 · 50a15fb0 · 50a15fb0
Commit 50a15fb0 authored Oct 13, 2010 by Mark Friedrichs
4 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -268,14 +268,6 @@ if( atomI == targetAtom ){
                // No interactions in this block.
            } else {
-                if (lasty != y) {
-                    // load shared data
-                   loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
-                }
                sA[threadIdx.x].force[0]     = 0.0f;
                sA[threadIdx.x].force[1]     = 0.0f;
                sA[threadIdx.x].force[2]     = 0.0f;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -250,6 +250,12 @@ void kCalculateAmoebaVdw14_7Reduction_kernel( float* inputForce, float4* outputF
    }
 }
+__device__ void sumTempBuffer( Vdw14_7Particle& atomI, Vdw14_7Particle& atomJ ){ // Device helper: adds atomJ.tempForce into atomI.tempForce in place; atomJ is only read here.
+    atomI.tempForce[0]  += atomJ.tempForce[0]; // component 0 of the three-element tempForce array
+    atomI.tempForce[1]  += atomJ.tempForce[1]; // component 1
+    atomI.tempForce[2]  += atomJ.tempForce[2]; // component 2
+} // NOTE(review): no synchronization is performed inside this helper; callers must ensure atomJ.tempForce is already written -- confirm at call sites.
 static void kCalculateAmoebaVdw14_7Reduction(amoebaGpuContext amoebaGpu, CUDAStream<float>* vdwOutputArray, CUDAStream<float4>* forceOutputArray )
 {

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
@@ -212,12 +212,12 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            if (flags == 0) {
            } else {
 #endif
               // zero shared fields
                zeroVdw14_7SharedForce( &(sA[threadIdx.x]) );
-            if( bExclusionFlag ){
+                if( bExclusionFlag )
+                {
                    unsigned int xi              = x >> GRIDBITS;
                    unsigned int yi              = y >> GRIDBITS;
@@ -231,20 +231,27 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                {
                    float ijForce[3];
+#ifdef USE_CUTOFF
+                    if ((flags&(1<<j)) != 0)
+                    {
+                        unsigned int jIdx  = (flags == 0xFFFFFFFF) ? tj : j;
+#else
+                        unsigned int jIdx  = tj;
+#endif
                        // get combined sigma and epsilon
                        float combindedSigma;
                        float combindedEpsilon;
-                getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   localParticle.sigma,   psA[tj].sigma,   &combindedSigma,
+                        getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   localParticle.sigma,   psA[jIdx].sigma,   &combindedSigma,
-                                                        epsilonCombiningRule, localParticle.epsilon, psA[tj].epsilon, &combindedEpsilon );
+                                                                epsilonCombiningRule, localParticle.epsilon, psA[jIdx].epsilon, &combindedEpsilon );
                        // calculate force
                        float energy;
-                ijForce[0]    = psA[tj].x - localParticle.x;
+                        ijForce[0]    = psA[jIdx].x - localParticle.x;
-                ijForce[1]    = psA[tj].y - localParticle.y;
+                        ijForce[1]    = psA[jIdx].y - localParticle.y;
-                ijForce[2]    = psA[tj].z - localParticle.z;
+                        ijForce[2]    = psA[jIdx].z - localParticle.z;
                        if( cAmoebaSim.vdwUsePBC )
                        {
                            ijForce[0]   -= floor(ijForce[0]*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
@@ -259,32 +266,72 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                        // mask out excluded ixns
-                unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                        unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
                        if( mask && bExclusionFlag ){
-                    unsigned int maskIndex  = 1 << tj;
+                            unsigned int maskIndex  = 1 << jIdx;
                            mask =  (exclusionMask & maskIndex) ? 0 : 1;
                        }
+                        if( mask == 0 )
+                        {
+                            energy = ijForce[0] = ijForce[1] = ijForce[2] = 0.0f;
+                        }
                        // accumulate force for atomI
-                forceSum[0]        += mask ? ijForce[0] : 0.0f;
+                        forceSum[0]        += ijForce[0];
-                forceSum[1]        += mask ? ijForce[1] : 0.0f;
+                        forceSum[1]        += ijForce[1];
-                forceSum[2]        += mask ? ijForce[2] : 0.0f;
+                        forceSum[2]        += ijForce[2];
-                // accumulate force for atomJ
+                        totalEnergy        += energy;
-                psA[tj].force[0]   -= mask ? ijForce[0] : 0.0f;
+#ifndef USE_CUTOFF
-                psA[tj].force[1]   -= mask ? ijForce[1] : 0.0f;
+                        psA[jIdx].force[0]   -= ijForce[0];
-                psA[tj].force[2]   -= mask ? ijForce[2] : 0.0f;
+                        psA[jIdx].force[1]   -= ijForce[1];
+                        psA[jIdx].force[2]   -= ijForce[2];
+#else
+                        if( flags == 0xFFFFFFFF ){
+                            psA[jIdx].force[0]   -= ijForce[0];
+                            psA[jIdx].force[1]   -= ijForce[1];
+                            psA[jIdx].force[2]   -= ijForce[2];
+                        } else {
+                            sA[threadIdx.x].tempForce[0]     = ijForce[0];
+                            sA[threadIdx.x].tempForce[1]     = ijForce[1];
+                            sA[threadIdx.x].tempForce[2]     = ijForce[2];
+                            if( tgx % 2 == 0 ){
+                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
+                            }
+                            if( tgx % 4 == 0 ){
+                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
+                            }
+                            if( tgx % 8 == 0 ){
+                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
+                            }
+                            if( tgx % 16 == 0 ){
+                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
+                            }
+                            if (tgx == 0)
+                            {
+                                psA[jIdx].force[0]  -= sA[threadIdx.x].tempForce[0]  + sA[threadIdx.x+16].tempForce[0];
+                                psA[jIdx].force[1]  -= sA[threadIdx.x].tempForce[1]  + sA[threadIdx.x+16].tempForce[1];
+                                psA[jIdx].force[2]  -= sA[threadIdx.x].tempForce[2]  + sA[threadIdx.x+16].tempForce[2];
+                            }
+                        }
+#endif
-                totalEnergy        += mask ? energy     : 0.0f;
 #ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+tj) == targetAtom ){
+if( atomI == targetAtom || (y+jIdx) == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
+        unsigned int index                 = (atomI == targetAtom) ? (y + jIdx) : atomI;
        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + tj); 
+        debugArray[index].y                = (float) (y + jIdx); 
        debugArray[index].z                = -3.0;
        debugArray[index].w                = (float) (mask + 1); 
@@ -311,12 +358,16 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
        debugArray[index].y                = mask ? ijForce[1] : 0.0f;
        debugArray[index].z                = mask ? ijForce[2] : 0.0f;
 }
+#endif
+#ifdef USE_CUTOFF
+                    }
 #endif
                    tj                  = (tj + 1) & (GRID - 1);
                } // end of j-loop 
 #ifdef USE_CUTOFF
-}
+            }
 #endif
            // Write results

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdwParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdwParticle.h
@@ -13,6 +13,7 @@ struct Vdw14_7Particle {
    float epsilon;
    float force[3];
+    float tempForce[3];
 };