No substantive changes -- cleanup

06089c9a · Mark Friedrichs · 4d6895b0 · 06089c9a · 06089c9a · 06089c9a
Commit 06089c9a authored Jul 27, 2011 by Mark Friedrichs
5 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -1642,9 +1642,10 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            }
        }

-        if( 0 && amoebaGpu->log )
-            fprintf( amoebaGpu->log, "Z1 %4d %d [%4d %4d %4d]\n", ii,  axisType[ii],
-                     multipoleParticleX[ii], multipoleParticleY[ii], multipoleParticleZ[ii] );
+        if( 0 && amoebaGpu->log ){
+            fprintf( amoebaGpu->log, "Z1 %4d %d [%4d %4d %4d] dmp/thole %15.7e %15.7e\n", ii,  axisType[ii],
+                     multipoleParticleX[ii], multipoleParticleY[ii], multipoleParticleZ[ii], dampingFactors[ii], tholes[ii] );
+        }

        // charges


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -513,17 +513,9 @@ static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle*
 }

 static __device__ void zeroElectrostaticParticle( struct ElectrostaticParticle* sA ){
-
-    // coordinates & charge
-
    sA->force[0]                 = 0.0f;
    sA->force[1]                 = 0.0f;
    sA->force[2]                 = 0.0f;
-/*
-    sA->torque[0]                = 0.0f;
-    sA->torque[1]                = 0.0f;
-    sA->torque[2]                = 0.0f;
-*/
 }

 #undef SUB_METHOD_NAME
@@ -649,7 +641,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
    debugArray->Upload();
-    unsigned int targetAtom                   = 237;
+    //unsigned int targetAtom                   = 1137;
+    unsigned int targetAtom                   = 1;
 #endif

    // on first pass, set threads/block
@@ -675,8 +668,14 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
    }
-#endif
-
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
+                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, debugArray->_pDevData, targetAtom );
+    } else {
+        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
+                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, debugArray->_pDevData, targetAtom );
+    }
+#else
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
@@ -684,8 +683,44 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
    }
+#endif
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

+#ifdef AMOEBA_DEBUG
+    if( 0 ){
+        debugArray->Download();
+        std::vector<double> conversions;
+        conversions.push_back( 0.1f/4.184f );
+        conversions.push_back( 0.1f/4.184f );
+        unsigned int kkBlocks = 4;
+        (void) fprintf( stderr, "\nTarget atom output %5u\n", targetAtom );
+        for( unsigned int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
+            double sum = 0.0;
+            for( unsigned int kk = 0; kk < kkBlocks && sum == 0.0; kk++ ){
+                unsigned int index = ii + kk*amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+                sum               += debugArray->_pSysData[index].x + debugArray->_pSysData[index].y + debugArray->_pSysData[index].z +  debugArray->_pSysData[index].w;
+            }
+            if( sum > 0.0 ){
+                (void) fprintf( stderr, "%5u", ii );
+                for( unsigned int kk = 0; kk < kkBlocks; kk++ ){
+                    unsigned int index = ii + kk*amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+                    (void) fprintf( stderr, " %15.7e %15.7e %15.7e %5.1f",
+                                    conversions[kk]*debugArray->_pSysData[index].x, conversions[kk]*debugArray->_pSysData[index].y, conversions[kk]*debugArray->_pSysData[index].z,
+                                    debugArray->_pSysData[index].w );
+                    if( ((kk+1) % 2) == 0 && (kk != (kkBlocks-1) ) ){
+                        (void) fprintf( stderr, "\n%5u", ii );
+                    }
+                }
+                (void) fprintf( stderr, "\n" );
+                if( kkBlocks > 2 ){
+                    (void) fprintf( stderr, "\n" );
+                }
+            }
+        }
+    }
+#endif
+
+#ifdef AMOEBA_DEBUG
    if( 0 ){ 
        VectorOfDoubleVectors outputVector;

@@ -694,7 +729,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
        fileId.push_back( call++ );

        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "ElectrostaticTemp");

        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
@@ -706,6 +741,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
        cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostaticTorque", fileId, outputVector );
        delete temp;
    }    
+#endif

    if( addTorqueToForce ){
        kReduceTorque( amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
@@ -42,10 +42,6 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
 #endif
 ){

-#ifdef AMOEBA_DEBUG
-    float4 pullBack[20];
-#endif
-
    extern __shared__ ElectrostaticParticle sA[];

    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
@@ -210,6 +206,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
                    psA[tj].force[0]                  -= force[0];
                    psA[tj].force[1]                  -= force[1];
                    psA[tj].force[2]                  -= force[2];
+
                }

                tj = (tj + 1) & (GRID - 1);
@@ -243,6 +240,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
 #endif
            zeroElectrostaticParticle( &(sA[threadIdx.x]) );
            zeroElectrostaticParticle( &localParticle );
+            tj = tgx;
            for (unsigned int j = 0; j < GRID; j++){
       
                unsigned int atomJ = y + tj;
@@ -264,6 +262,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
                    psA[tj].force[0]                 += force[0];
                    psA[tj].force[1]                 += force[1];
                    psA[tj].force[2]                 += force[2];
+
                }

                tj = (tj + 1) & (GRID - 1);
@@ -298,5 +297,6 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(

        pos++;
    }
+
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += (conversionFactor*totalEnergy);
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -110,7 +110,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n", methodName,
+        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaFixedEField numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
@@ -121,23 +121,13 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
-#endif
    } else {

        kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
-#endif
    }

    LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
@@ -198,91 +188,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
        (void) fflush( amoebaGpu->log );

        (void) fprintf( amoebaGpu->log, "EFields End\n" );
-/*
-        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
-        debugArray->Download();
-        if( 0 ){
-               int ii = targetAtom;
-               float sum[2][3] = { { 0.0f, 0.0f, 0.0f },  { 0.0f, 0.0f, 0.0f } };
-
-               (void) fprintf( amoebaGpu->log,"\n" );
-               for( int jj = 0; jj < 1248; jj++ ){
-                   int debugIndex = jj;
-                   if( jj == ii )continue;
-                   (void) fprintf( amoebaGpu->log,"\n\n%4d %4d rrs\n[%16.9e %16.9e %16.9e %16.9e]\n",
-                                   ii, jj,
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-    
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-    
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-    
-    
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-    
-    
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"Y1 %5d %16.9e %16.9e %16.9e\n", jj,
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-
-                   sum[0][0] += debugArray->_pSysData[debugIndex].x;
-                   sum[0][1] += debugArray->_pSysData[debugIndex].y;
-                   sum[0][2] += debugArray->_pSysData[debugIndex].z;
-    
-                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"Y2 %5d %16.9e %16.9e %16.9e\n", jj,
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z );
-
-                   sum[1][0] += debugArray->_pSysData[debugIndex].x;
-                   sum[1][1] += debugArray->_pSysData[debugIndex].y;
-                   sum[1][2] += debugArray->_pSysData[debugIndex].z;
-
-               }
-               (void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
-                               sum[0][0], sum[0][1], sum[0][2],
-                               sum[1][0], sum[1][1], sum[1][2] );
-           }
-        for( unsigned int ii = 0; ii < debugArray->_stride; ii++ ){
-           int print;
-           if( debugArray->_pSysData[ii].x  != 0.0f || debugArray->_pSysData[ii].y != 0.0f ||
-               debugArray->_pSysData[ii].y  != 0.0f || debugArray->_pSysData[ii].w != 0.0f ||
-               debugArray->_pSysData[ii].x  != debugArray->_pSysData[ii].x               ||
-               debugArray->_pSysData[ii].y  != debugArray->_pSysData[ii].y               ||
-               debugArray->_pSysData[ii].z  != debugArray->_pSysData[ii].z               ||
-               debugArray->_pSysData[ii].w  != debugArray->_pSysData[ii].w               ){
-               print = 0;
-           } else {
-               print = 0;
-           }
-           if( print ){
-                unsigned int atomI = ii/amoebaGpu->paddedNumberOfAtoms;
-                unsigned int atomJ = ii - atomI*amoebaGpu->paddedNumberOfAtoms;
-                (void) fprintf( amoebaGpu->log, "%5u [%5u %5u]  ", ii, atomI, atomJ);
-                (void) fprintf( amoebaGpu->log, "%14.6e %14.6e %14.6e %14.6e\n",
-                                debugArray->_pSysData[ii].x,
-                                debugArray->_pSysData[ii].y,
-                                debugArray->_pSysData[ii].z,
-                                debugArray->_pSysData[ii].w );
-            }
-        }
-*/

        // write results to file


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
@@ -447,7 +447,7 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
        fileId.push_back( call++ );

        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp_MapTorqueAndAddToForce");

        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
@@ -479,7 +479,7 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
        fileId.push_back( call++ );

        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "TempMapTorqueAndAddToForce2");

        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );