Commit 72bd8a83 authored by Peter Eastman's avatar Peter Eastman
Browse files

Further optimizations

parent 80d8311e
...@@ -943,6 +943,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -943,6 +943,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaElectrostatic"; static const char* methodName = "cudaComputeAmoebaElectrostatic";
static int timestep = 0; static int timestep = 0;
...@@ -959,8 +961,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -959,8 +961,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// apparently debug array can take up nontrivial no. registers // apparently debug array can take up nontrivial no. registers
#undef THREADS_PER_BLOCK
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d" (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
...@@ -976,15 +976,28 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -976,15 +976,28 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
unsigned int targetAtom = 0; unsigned int targetAtom = 0;
#endif #endif
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 256;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
}
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0], gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0], amoebaGpu->psLabFrameDipole->_pDevStream[0],
...@@ -1003,12 +1016,12 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1003,12 +1016,12 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(ElectrostaticParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0], gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0], amoebaGpu->psLabFrameDipole->_pDevStream[0],
......
...@@ -27,15 +27,13 @@ ...@@ -27,15 +27,13 @@
#include "amoebaScaleFactors.h" #include "amoebaScaleFactors.h"
__global__ __global__
/*
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(256, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(128, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(64, 1)
#endif #endif
*/
void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)( void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
unsigned int* workUnit, unsigned int* workUnit,
float4* atomCoord, float4* atomCoord,
......
...@@ -1958,13 +1958,13 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1958,13 +1958,13 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block and based on that setting the energy buffer array // on first pass, set threads/block and based on that setting the energy buffer array
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
#if (__CUDA_ARCH__ >= 200) unsigned int maxThreads;
unsigned int maxThreads = 256; if (gpu->sm_version >= SM_20)
#elif (__CUDA_ARCH__ >= 130) maxThreads = 256;
unsigned int maxThreads = 128; else if (gpu->sm_version >= SM_12)
#else maxThreads = 128;
unsigned int maxThreads = 64; else
#endif maxThreads = 64;
threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads); threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
//unsigned int eDiffhreadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)); //unsigned int eDiffhreadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle));
//unsigned int maxThreadsPerBlock = threadsPerBlock> eDiffhreadsPerBlock ? threadsPerBlock : eDiffhreadsPerBlock; //unsigned int maxThreadsPerBlock = threadsPerBlock> eDiffhreadsPerBlock ? threadsPerBlock : eDiffhreadsPerBlock;
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
#include "amoebaScaleFactors.h" #include "amoebaScaleFactors.h"
__global__ __global__
/*
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(256, 1) __launch_bounds__(256, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
...@@ -35,7 +34,6 @@ __launch_bounds__(128, 1) ...@@ -35,7 +34,6 @@ __launch_bounds__(128, 1)
#else #else
__launch_bounds__(64, 1) __launch_bounds__(64, 1)
#endif #endif
*/
void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)( void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
unsigned int* workUnit unsigned int* workUnit
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
......
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(192, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(96, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(32, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
unsigned int* workUnit, unsigned int* workUnit,
......
...@@ -40,6 +40,7 @@ struct KirkwoodParticle { ...@@ -40,6 +40,7 @@ struct KirkwoodParticle {
float dBornRadius; float dBornRadius;
float dBornRadiusPolar; float dBornRadiusPolar;
float padding;
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment