Further optimizations

72bd8a83 · Peter Eastman · 80d8311e · 72bd8a83 · 72bd8a83 · 72bd8a83
Commit 72bd8a83 authored Jul 29, 2010 by Peter Eastman
6 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -943,6 +943,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
  
   // ---------------------------------------------------------------------------------------

+    static unsigned int threadsPerBlock = 0;
+
 #ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaElectrostatic";
    static int timestep = 0;
@@ -959,8 +961,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )

    // apparently debug array can take up nontrivial no. registers

-#undef THREADS_PER_BLOCK
-
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
@@ -976,15 +976,28 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
    unsigned int targetAtom                   = 0;
 #endif

+    // On first pass, set threads/block: use the getThreadsPerBlock() value, capped at maxThreads
+    // (256/128/64 by sm_version), the __launch_bounds__ limits in kCalculateAmoebaCudaElectrostatic.h.
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 256;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
+    }
+
    kClearFields_3( amoebaGpu, 2 );

    if (gpu->bOutputBufferPerWarp){

        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
-        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>(
+        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           gpu->psPosq4->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
@@ -1003,12 +1016,12 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )

 #ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
 #endif

-        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>(
+        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           gpu->psPosq4->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
@@ -27,15 +27,13 @@
 #include "amoebaScaleFactors.h"

 __global__ 
-/*
 #if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(256, 1)
 #elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(128, 1)
 #else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(64, 1)
 #endif
-*/
 void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
                            unsigned int* workUnit,
                            float4* atomCoord,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1958,13 +1958,13 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
    // on first pass, set threads/block and based on that setting the energy buffer array

    if( threadsPerBlock == 0 ){
-#if (__CUDA_ARCH__ >= 200)
-        unsigned int maxThreads = 256;
-#elif (__CUDA_ARCH__ >= 130)
-        unsigned int maxThreads = 128;
-#else
-        unsigned int maxThreads = 64;
-#endif
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 256;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
        threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
        //unsigned int eDiffhreadsPerBlock            = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle));
        //unsigned int maxThreadsPerBlock             = threadsPerBlock> eDiffhreadsPerBlock ? threadsPerBlock : eDiffhreadsPerBlock;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
@@ -27,7 +27,6 @@
 #include "amoebaScaleFactors.h"

 __global__
-/*
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(256, 1)
 #elif (__CUDA_ARCH__ >= 130)
@@ -35,7 +34,6 @@ __launch_bounds__(128, 1)
 #else
 __launch_bounds__(64, 1)
 #endif
-*/
 void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
                            unsigned int* workUnit
 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
@@ -28,11 +28,11 @@

 __global__
 #if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(192, 1)
 #elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(96, 1)
 #else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(32, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
                            unsigned int* workUnit,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodParticle.h
@@ -40,6 +40,7 @@ struct KirkwoodParticle {

    float dBornRadius;
    float dBornRadiusPolar;
+    float padding;

 };