Fixed bug in Vdw w/o cutoffs if bOutputBufferPerWarp is set

1b5ee8f9 · Mark Friedrichs · db8a55b3 · 1b5ee8f9 · 1b5ee8f9 · 1b5ee8f9
Commit 1b5ee8f9 authored Dec 10, 2010 by Mark Friedrichs
6 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1129,6 +1129,10 @@ void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const Amoeba
                               force.getSigmaCombiningRule(), force.getEpsilonCombiningRule(),
                               allExclusions, force.getPBC(), static_cast<float>(force.getCutoff()) );
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
+    if( data.getLog() ){
+        (void) fprintf( data.getLog(), "CudaCalcAmoebaVdwForceKernel PBC=%d getUseNeighborList=%d\n",
+                        force.getPBC(), force.getUseNeighborList() );
+    }
    data.setUseVdwNeighborList( force.getUseNeighborList() );
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -306,7 +306,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector );
+            //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector);
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -247,8 +247,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
    static const char* methodName = "cudaComputeAmoebaMutualInducedFieldMatrixMultiply";
    static int iteration = 1;
    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s: scalingDistanceCutoff=%.5f\n",
+        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
-                        methodName, amoebaGpu->scalingDistanceCutoff );
        (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
@@ -594,17 +593,16 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
 #ifdef AMOEBA_DEBUG
-/*
    if( 0 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector );
+//        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector );
        cudaWriteVectorOfDoubleVectorsToFile( "CudaMI", fileId, outputVector );
     }
-*/
 #endif
   // ---------------------------------------------------------------------------------------

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -599,7 +599,6 @@ if( 0 ){
                     x          = (x >> 17) << GRIDBITS;
        (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
    }    
-} else {
 }
            cudaComputeAmoebaPmeFixedEField( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -10,6 +10,7 @@
 #include "amoebaScaleFactors.h"
 #include <stdio.h>
+extern int isNanOrInfinity( double number );
 using namespace std;
@@ -36,8 +37,11 @@ void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext amoebaGpu)
    RTERROR(status, "GetCalculateAmoebaCudaVdw14_7Sim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }
-#define AMOEBA_DEBUG
+//#define AMOEBA_DEBUG_PRINT
-//#undef AMOEBA_DEBUG
+#undef AMOEBA_DEBUG_PRINT
+//#define AMOEBA_DEBUG
+#undef AMOEBA_DEBUG
 __device__ void zeroVdw14_7SharedForce( struct Vdw14_7Particle* sA ) 
 {
@@ -486,17 +490,19 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
     gpuContext gpu    = amoebaGpu->gpuContext;
-#ifdef AMOEBA_DEBUG
+#ifdef AMOEBA_DEBUG_PRINT
    static const char* methodName = "kCalculateAmoebaVdw14_7Forces";
    if( 1 && amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s: \n", methodName );
        (void) fflush( amoebaGpu->log );
    }   
+#ifdef AMOEBA_DEBUG
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    int targetAtom                             = 342;
+#endif
 #endif
    // set threads/block first time through
@@ -517,12 +523,30 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
    kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates );
    kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates );
-#ifdef AMOEBA_DEBUG
+#ifdef AMOEBA_DEBUG_PRINT
    (void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
    (void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
                    amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                    sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
                    amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+    if( 0 ){  
+        gpu->psInteractionCount->Download();
+        amoebaGpu->psVdwWorkUnit->Download();
+        unsigned int totalWarps  = (amoebaGpu->nonbondBlocks*threadsPerBlock)/GRID;
+        float        ratiof     = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
+        (void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysStream[0][0] );
+        for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
+            unsigned int x          = amoebaGpu->psVdwWorkUnit->_pSysStream[0][ii];
+            unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+            unsigned int exclusions = (x & 0x1);
+                         x          = (x >> 17) << GRIDBITS;
+            float        warp       = (float)(ii)*ratiof; 
+            (void) fprintf( amoebaGpu->log, "GpuCell %8u  [%5u %5u %1u] %10u warp=%15.6f\n", ii, x,y,exclusions, warp );
+        }           
+    }
    (void) fflush( amoebaGpu->log );
 #endif
@@ -541,7 +565,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
        LAUNCHERROR("kFindInteractionsWithinBlocksVdwPeriodic");
-if( 1 ){  
+        if( 0 ){  
            gpu->psInteractionCount->Download();
            gpu->psInteractingWorkUnit->Download();
            gpu->psInteractionFlag->Download();
@@ -562,7 +586,7 @@ if( 1 ){
                (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
            }    
                (void) fflush( amoebaGpu->log );
-}
+        }
        if (gpu->bOutputBufferPerWarp){
            kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
@@ -578,6 +602,7 @@ if( 1 ){
                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
 #endif
        } else {
            kCalculateAmoebaVdw14_7Cutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                gpu->sim.pInteractingWorkUnit,
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
@@ -598,9 +623,8 @@ if( 1 ){
        if (gpu->bOutputBufferPerWarp){
-                                                                //amoebaGpu->psVdwWorkUnit->_pDevStream[0],
            kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
-                                                                gpu->sim.pInteractingWorkUnit,
+                                                                amoebaGpu->psVdwWorkUnit->_pDevStream[0],
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
                                                                amoebaGpu->vdwSigmaCombiningRule,
@@ -612,6 +636,7 @@ if( 1 ){
                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
 #endif
        } else {
            kCalculateAmoebaVdw14_7N2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                amoebaGpu->psVdwWorkUnit->_pDevStream[0],
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
@@ -629,14 +654,15 @@ if( 1 ){
        LAUNCHERROR("kCalculateAmoebaVdw14_7N2");  
    }
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        debugArray->Download();
+#ifdef AMOEBA_DEBUG_PRINT
+    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "Finished 14-7 kernel execution\n" );
        (void) fflush( amoebaGpu->log );
+#ifdef AMOEBA_DEBUG
+        debugArray->Download();
        int paddedNumberOfAtoms          = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
        double cutOff                    = 1.0e+03;
        for( int jj = 0; jj < gpu->natoms; jj++ ){
@@ -655,6 +681,25 @@ if( 1 ){
            }
            (void) fprintf( amoebaGpu->log,"\n" );
        }
+#endif
+        amoebaGpu->psWorkArray_3_2->Download();
+        amoebaGpu->psWorkArray_3_1->Download();
+        //for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
+        for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
+            for( int kk = 0; kk < amoebaGpu->outputBuffers; kk++ ){
+                float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
+                if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] )  )
+                (void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,
+                                amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj],
+                                amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+1],
+                                amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2],
+                                amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj],
+                                amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+1],
+                                amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+2] );
+            }
+        }
    }
 #endif
@@ -667,7 +712,7 @@ if( 1 ){
        CUDAStream<float4>* psTempForce     = new CUDAStream<float4>(paddedNumberOfAtoms, 1, "psTempForce");
        kClearFloat4( amoebaGpu, paddedNumberOfAtoms, psTempForce );
        kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
-        kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
+        //kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
@@ -147,7 +147,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
                forceSum[0]            += mask ? ijForce[0]  : 0.0f;
                forceSum[1]            += mask ? ijForce[1]  : 0.0f;
                forceSum[2]            += mask ? ijForce[2]  : 0.0f;
-                totalEnergy            += mask ? 0.5*energy : 0.0f;
+                totalEnergy            += mask ? 0.5f*energy : 0.0f;
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom || (y+j) == targetAtom ){