"platforms/vscode:/vscode.git/clone" did not exist on "3b3def0e3efe0bd76f2754b0b65417d6c9cc5e70"
Commit c2361935 authored by Peter Eastman's avatar Peter Eastman
Browse files

Further optimizations

parent 72bd8a83
...@@ -986,7 +986,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -986,7 +986,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
maxThreads = 128; maxThreads = 128;
else else
maxThreads = 64; maxThreads = 64;
threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
} }
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
......
...@@ -370,6 +370,8 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -370,6 +370,8 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -393,11 +395,24 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -393,11 +395,24 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#endif #endif
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 256;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
}
kClearFields_3( amoebaGpu, 3 ); kClearFields_3( amoebaGpu, 3 );
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
(void) fprintf( amoebaGpu->log, "N2 warp\n" ); (void) fprintf( amoebaGpu->log, "N2 warp\n" );
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>( kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0], gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0], amoebaGpu->psLabFrameDipole->_pDevStream[0],
...@@ -416,12 +431,12 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -416,12 +431,12 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondElectrostaticThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondElectrostaticThreadsPerBlock>>>( kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0], gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0], amoebaGpu->psLabFrameDipole->_pDevStream[0],
......
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(256, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(128, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(64, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)( void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
......
...@@ -1965,7 +1965,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1965,7 +1965,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads = 128; maxThreads = 128;
else else
maxThreads = 64; maxThreads = 64;
threadsPerBlock = std::max(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
//unsigned int eDiffhreadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)); //unsigned int eDiffhreadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle));
//unsigned int maxThreadsPerBlock = threadsPerBlock> eDiffhreadsPerBlock ? threadsPerBlock : eDiffhreadsPerBlock; //unsigned int maxThreadsPerBlock = threadsPerBlock> eDiffhreadsPerBlock ? threadsPerBlock : eDiffhreadsPerBlock;
......
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(256, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(128, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(64, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)( void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
......
...@@ -26,11 +26,11 @@ ...@@ -26,11 +26,11 @@
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(192, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(64, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)( void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
......
...@@ -15,6 +15,7 @@ struct WcaDispersionParticle { ...@@ -15,6 +15,7 @@ struct WcaDispersionParticle {
float epsilon; float epsilon;
float force[3]; float force[3];
float padding;
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment