Further optimizations

c2361935 · Peter Eastman · 72bd8a83 · c2361935 · c2361935 · c2361935
Commit c2361935 authored Jul 30, 2010 by Peter Eastman
7 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -986,7 +986,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
    }
    kClearFields_3( amoebaGpu, 2 );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
@@ -370,6 +370,8 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
   // ---------------------------------------------------------------------------------------
+    static unsigned int threadsPerBlock = 0;
    gpuContext gpu                             = amoebaGpu->gpuContext;
 #ifdef AMOEBA_DEBUG
@@ -393,11 +395,24 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
 #endif
+    // on first pass, set threads/block
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 256;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
+    }
    kClearFields_3( amoebaGpu, 3 );
    if (gpu->bOutputBufferPerWarp){
        (void) fprintf( amoebaGpu->log, "N2 warp\n" );
-        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>(
+        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           gpu->psPosq4->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
@@ -416,12 +431,12 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
 #endif
-        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>(
+        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                          amoebaGpu->psWorkUnit->_pDevStream[0],
                                                          gpu->psPosq4->_pDevStream[0],
                                                          amoebaGpu->psLabFrameDipole->_pDevStream[0],

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
@@ -28,11 +28,11 @@
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(256, 1)
 #elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(128, 1)
 #else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
                            unsigned int* workUnit,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1965,7 +1965,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
        //unsigned int eDiffhreadsPerBlock            = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle));
        //unsigned int maxThreadsPerBlock             = threadsPerBlock> eDiffhreadsPerBlock ? threadsPerBlock : eDiffhreadsPerBlock;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
@@ -28,11 +28,11 @@
 __global__
 #if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(256, 1)
 #elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(128, 1)
 #else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
                            unsigned int* workUnit,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
@@ -26,11 +26,11 @@
 __global__
 #if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(384, 1)
 #elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(192, 1)
 #else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+__launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
                            unsigned int* workUnit,

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersionParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersionParticle.h
@@ -15,6 +15,7 @@ struct WcaDispersionParticle {
    float epsilon;
    float force[3];
+    float padding;
 };