Warp/non-warp calls were reversed in kCalculateAmoebaCudaKirkwood

132a94bc · Mark Friedrichs · 07f8d5ce · 132a94bc · 132a94bc · 132a94bc
Commit 132a94bc authored Jan 20, 2011 by Mark Friedrichs
8 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -793,6 +793,8 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    if( data.getHasAmoebaGeneralizedKirkwood() ){
        kCalculateObcGbsaBornSum(gpu->gpuContext);
        kReduceObcGbsaBornSum(gpu->gpuContext);
+        //initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psBornRadii, 0.1 );
+        //initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psObcChain,  0.0 );
    }   

    // multipoles
@@ -801,7 +803,6 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {

 //kClearForces(gpu->gpuContext);
 //kClearEnergy(gpu->gpuContext);
-//(void) fprintf( data.getLog(), "computeAmoebaMultipoleForce clearing forces/energy after kCalculateAmoebaMultipoleForces()\n" );

    // GK


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -191,12 +191,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )

    (void) fprintf( log, "\n\n" );
    (void) fprintf( log, "     gpuContext                         %p\n",      amoebaGpu->gpuContext );
-    (void) fprintf( log, "     log                                %p\n",      amoebaGpu->log );
+    (void) fprintf( log, "     log                                %p %s\n",   amoebaGpu->log, amoebaGpu->log == stderr ? "is stderr" : "is not stderr");
    (void) fprintf( log, "     sm_version                         %u\n",      gpu->sm_version );
    (void) fprintf( log, "     device                             %u\n",      gpu->device );
    (void) fprintf( log, "     sharedMemoryPerBlock               %u\n",      gpu->sharedMemoryPerBlock );
-    (void) fprintf( log, "     pMapArray                          %p\n",      amoebaGpu->pMapArray );
-    (void) fprintf( log, "     dMapArray                          %p\n",      amoebaGpu->dMapArray );
    (void) fprintf( log, "     bOutputBufferPerWarp               %d\n",      amoebaGpu->bOutputBufferPerWarp );
    (void) fprintf( log, "     paddedNumberOfAtoms                %u\n",      amoebaGpu->paddedNumberOfAtoms );
    (void) fprintf( log, "     nonbondBlocks                      %u\n",      amoebaGpu->nonbondBlocks );
@@ -209,6 +207,13 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     outputBuffers                      %u\n",      amoebaGpu->outputBuffers );
    (void) fprintf( log, "     workUnits                          %u\n",      amoebaGpu->workUnits );

+    gpuPrintCudaStreamFloat(  amoebaGpu->gpuContext->psEnergy,    log );
+    gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psForce4,    log );
+    gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psPosq4,     log );
+    gpuPrintCudaStreamFloat2( amoebaGpu->gpuContext->psObcData,   log );
+    gpuPrintCudaStreamFloat(  amoebaGpu->gpuContext->psBornForce, log );
+    (void) fprintf( log, "\n\n" );
+    (void) fprintf( log, "     amoebaBonds                       %u\n",      amoebaGpu->amoebaSim.amoebaBonds );
    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_1, log );
    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_2, log );
    gpuPrintCudaStreamFloat(  amoebaGpu->psWorkArray_3_3, log );
@@ -337,6 +342,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     quartic                            %15.7e\n",  amoebaGpu->amoebaSim.amoebaUreyBradleyQuarticicParameter);
    (void) fprintf( log, "     pAmoebaUreyBradleyID               %p\n",      amoebaGpu->amoebaSim.pAmoebaUreyBradleyID );
    (void) fprintf( log, "     pAmoebaUreyBradleyParameter        %p\n",      amoebaGpu->amoebaSim.pAmoebaUreyBradleyParameter );
+    (void) fprintf( log, "\n\n" );
    
 //    if( amoebaGpu->psRotationMatrix)(void) fprintf( log, "\n" );
 //    gpuPrintCudaStreamFloat( amoebaGpu->psRotationMatrix, log );
@@ -394,7 +400,6 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )

    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log );
    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
    gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log );

    (void) fprintf( log, "     numberOfSorWorkVectors             %u\n",  amoebaGpu->numberOfSorWorkVectors);
@@ -4437,3 +4442,32 @@ void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){
 }

 #undef  AMOEBA_DEBUG
+
+/**---------------------------------------------------------------------------------------
+
+   Set the first numberOfParticles*entriesPerParticle entries of a CUDA float stream's
+   _pSysStream[0] buffer to initValue, then call Upload() on the stream
+   @param numberOfParticles    number of particles
+   @param entriesPerParticle   number of consecutive entries written for each particle
+   @param array                CUDA float stream to fill; dereferenced without a null or size check
+   @param initValue            value assigned to every entry written
+
+   --------------------------------------------------------------------------------------- */
+
+void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle,
+                               CUDAStream<float>* array, float initValue )
+{
+    // ---------------------------------------------------------------------------------------
+
+    // static const std::string methodName = "initializeCudaFloatArray";
+
+    // entry jj of particle ii is stored at flat index entriesPerParticle*ii + jj
+
+    for( int ii = 0; ii < numberOfParticles; ii++ ){ 
+        for( int jj = 0; jj < entriesPerParticle; jj++ ) { 
+            array->_pSysStream[0][entriesPerParticle*ii+jj] = initValue;
+        }
+    }
+    array->Upload(); // NOTE(review): presumably copies _pSysStream to the device copy -- confirm in CUDAStream
+}
+
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -151,6 +151,8 @@ extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticl
 extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector );
 extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order );
 extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
+extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue );
+

 extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear );
 extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1907,7 +1907,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
    kClearFields_3( amoebaGpu, 6 );

    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0]
 #ifdef AMOEBA_DEBUG
                                                                           , debugArray->_pDevStream[0], targetAtom );
@@ -1924,7 +1924,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
        (void) fflush( amoebaGpu->log );
 #endif

-        kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0]
 #ifdef AMOEBA_DEBUG
                                                                           , debugArray->_pDevStream[0], targetAtom );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -1059,6 +1059,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads);
    }   
    
+#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log && timestep == 1 ){
        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
                                        " Ebuf=%u ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n",
@@ -1068,6 +1069,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
        //gpuPrintCudaAmoebaGmxSimulation(amoebaGpu, amoebaGpu->log );
        (void) fflush( amoebaGpu->log );
    }   
+#endif

    if (gpu->bOutputBufferPerWarp){


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -502,8 +502,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
    int targetAtom    = 0;
    static const char* methodName       = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply";
    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s: scalingDistanceCutoff=%.5f\n",
-                        methodName, amoebaGpu->scalingDistanceCutoff );
+        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
        (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
@@ -583,7 +582,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
        //printMiFieldBuffer( amoebaGpu, 37 );
        //printMiFieldBuffer( amoebaGpu, 38 );

-    if( amoebaGpu->log && iteration == -1 ){
+    if( amoebaGpu->log && iteration == 1 ){

        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -1390,6 +1390,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
    }

 }
+
 /**---------------------------------------------------------------------------------------

   Compute Amoeba electrostatic force & torque using PME
@@ -1413,7 +1414,7 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
        zeroForce( amoebaGpu );
    }

-    if( 1 ){
+    if( 0 ){
        gpuContext gpu = amoebaGpu->gpuContext;
        std::vector<int> fileId;


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -565,7 +565,23 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG

    if( hasAmoebaGeneralizedKirkwood ){
        cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
+        if( 0 ){
+            gpuContext gpu = amoebaGpu->gpuContext;
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, 0.0 );
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, 0.0 );
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, 0.0 );
+        }
+
        cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
+        if( 0 ){
+            gpuContext gpu = amoebaGpu->gpuContext;
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, 0.0 );
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, 0.0 );
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipoleS, 0.0 );
+            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolarS, 0.0 );
+            amoebaGpu->mutualInducedDone = 1;
+        }
+
    } else {
        if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
            cudaComputeAmoebaFixedEField( amoebaGpu );