Commit dd218599 authored by Mark Friedrichs
Browse files

Remove nonbondElectrostaticThreadsPerBlock, energyOutputBuffers

Set nonbondThreadsPerBlock to gpu->sim.nonbond_threads_per_block
parent 80ae6d45
...@@ -195,13 +195,16 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -195,13 +195,16 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " sm_version %u\n", gpu->sm_version ); (void) fprintf( log, " sm_version %u\n", gpu->sm_version );
(void) fprintf( log, " device %u\n", gpu->device ); (void) fprintf( log, " device %u\n", gpu->device );
(void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock ); (void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock );
(void) fprintf( log, " bOutputBufferPerWarp %d\n", amoebaGpu->bOutputBufferPerWarp ); (void) fprintf( log, " amoebaBOutputBufferPerWarp %d\n", amoebaGpu->bOutputBufferPerWarp );
(void) fprintf( log, " bOutputBufferPerWarp %d\n", gpu->bOutputBufferPerWarp );
//`amoebaGpu->nonbondOutputBuffers = (amoebaGpu->nonbondBlocks*amoebaGpu->nonbondThreadsPerBlock)/GRID;
(void) fprintf( log, " paddedNumberOfAtoms %u\n", amoebaGpu->paddedNumberOfAtoms ); (void) fprintf( log, " paddedNumberOfAtoms %u\n", amoebaGpu->paddedNumberOfAtoms );
(void) fprintf( log, " nonbondBlocks %u\n", amoebaGpu->nonbondBlocks ); (void) fprintf( log, " amoebaNonbondBlocks %u\n", amoebaGpu->nonbondBlocks );
(void) fprintf( log, " nonbondThreadsPerBlock %u\n", amoebaGpu->nonbondThreadsPerBlock ); (void) fprintf( log, " nonbondBlocks %u\n", gpu->sim.nonbond_blocks );
(void) fprintf( log, " nonbondElectrostaticThreadsPerBlock%u\n", amoebaGpu->nonbondElectrostaticThreadsPerBlock ); (void) fprintf( log, " amoebaNonbondThreadsPerBlock %u\n", amoebaGpu->nonbondThreadsPerBlock );
(void) fprintf( log, " nonbondOutputBuffers %u\n", amoebaGpu->nonbondOutputBuffers ); (void) fprintf( log, " nonbondThreadsPerBlock %u\n", gpu->sim.nonbond_threads_per_block);
(void) fprintf( log, " energyOutputBuffers %u\n", amoebaGpu->energyOutputBuffers ); (void) fprintf( log, " amoebaNonbondOutputBuffers %u\n", amoebaGpu->nonbondOutputBuffers );
(void) fprintf( log, " nonbondOutputBuffers %u\n", gpu->sim.nonbondOutputBuffers );
(void) fprintf( log, " threadsPerBlock %u\n", amoebaGpu->threadsPerBlock ); (void) fprintf( log, " threadsPerBlock %u\n", amoebaGpu->threadsPerBlock );
(void) fprintf( log, " fieldReduceThreadsPerBlock %u\n", amoebaGpu->fieldReduceThreadsPerBlock ); (void) fprintf( log, " fieldReduceThreadsPerBlock %u\n", amoebaGpu->fieldReduceThreadsPerBlock );
(void) fprintf( log, " outputBuffers %u\n", amoebaGpu->outputBuffers ); (void) fprintf( log, " outputBuffers %u\n", amoebaGpu->outputBuffers );
...@@ -2902,11 +2905,7 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu ) ...@@ -2902,11 +2905,7 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
amoebaGpu->nonbondBlocks = amoebaGpu->gpuContext->sim.blocks; amoebaGpu->nonbondBlocks = amoebaGpu->gpuContext->sim.blocks;
amoebaGpu->threadsPerBlock = amoebaGpu->gpuContext->sim.threads_per_block; amoebaGpu->threadsPerBlock = amoebaGpu->gpuContext->sim.threads_per_block;
amoebaGpu->nonbondThreadsPerBlock = amoebaGpu->gpuContext->sim.nonbond_threads_per_block;
// nonbondThreadsPerBlock & nonbondElectrostaticThreadsPerBlock need to be multiples of 32
amoebaGpu->nonbondThreadsPerBlock = 192;
amoebaGpu->nonbondElectrostaticThreadsPerBlock = 128;
amoebaGpu->fieldReduceThreadsPerBlock = (amoebaGpu->paddedNumberOfAtoms*3 + amoebaGpu->gpuContext->natoms + amoebaGpu->nonbondBlocks - 1) / amoebaGpu->nonbondBlocks; amoebaGpu->fieldReduceThreadsPerBlock = (amoebaGpu->paddedNumberOfAtoms*3 + amoebaGpu->gpuContext->natoms + amoebaGpu->nonbondBlocks - 1) / amoebaGpu->nonbondBlocks;
amoebaGpu->fieldReduceThreadsPerBlock = ((amoebaGpu->fieldReduceThreadsPerBlock + (amoebaGpu->gpuContext->grid - 1)) / amoebaGpu->gpuContext->grid) * amoebaGpu->gpuContext->grid; amoebaGpu->fieldReduceThreadsPerBlock = ((amoebaGpu->fieldReduceThreadsPerBlock + (amoebaGpu->gpuContext->grid - 1)) / amoebaGpu->gpuContext->grid) * amoebaGpu->gpuContext->grid;
...@@ -2927,16 +2926,13 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu ) ...@@ -2927,16 +2926,13 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
amoebaGpu->nonbondOutputBuffers = paddedNumberOfAtoms/GRID; amoebaGpu->nonbondOutputBuffers = paddedNumberOfAtoms/GRID;
} }
amoebaGpu->outputBuffers = amoebaGpu->nonbondOutputBuffers; amoebaGpu->outputBuffers = amoebaGpu->nonbondOutputBuffers;
//amoebaGpu->energyOutputBuffers = max(amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->sim.localForces_threads_per_block)*amoebaGpu->sim.blocks;
amoebaGpu->energyOutputBuffers = amoebaGpu->nonbondThreadsPerBlock*amoebaGpu->nonbondBlocks;
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "amoebaGpuBuildOutputBuffers: bOutputBufferPerWarp=%u nonbondBuffers=%u " (void) fprintf( amoebaGpu->log, "amoebaGpuBuildOutputBuffers: bOutputBufferPerWarp=%u nonbondBuffers=%u "
"outputBuffers=%u energyBuffers=%d nonbondBlocks=%u fieldReduceThreadsPerBlock=%u\n", "outputBuffers=%u onbondBlocks=%u fieldReduceThreadsPerBlock=%u\n",
amoebaGpu->bOutputBufferPerWarp, amoebaGpu->bOutputBufferPerWarp,
amoebaGpu->nonbondOutputBuffers, amoebaGpu->nonbondOutputBuffers,
amoebaGpu->outputBuffers, amoebaGpu->outputBuffers,
amoebaGpu->energyOutputBuffers,
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondBlocks,
amoebaGpu->fieldReduceThreadsPerBlock ); amoebaGpu->fieldReduceThreadsPerBlock );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
......
...@@ -56,9 +56,7 @@ struct _amoebaGpuContext { ...@@ -56,9 +56,7 @@ struct _amoebaGpuContext {
unsigned int paddedNumberOfAtoms; unsigned int paddedNumberOfAtoms;
unsigned int nonbondBlocks; unsigned int nonbondBlocks;
unsigned int nonbondThreadsPerBlock; unsigned int nonbondThreadsPerBlock;
unsigned int nonbondElectrostaticThreadsPerBlock;
unsigned int nonbondOutputBuffers; unsigned int nonbondOutputBuffers;
unsigned int energyOutputBuffers;
unsigned int threadsPerBlock; unsigned int threadsPerBlock;
unsigned int fieldReduceThreadsPerBlock; unsigned int fieldReduceThreadsPerBlock;
unsigned int outputBuffers; unsigned int outputBuffers;
......
...@@ -777,9 +777,9 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -777,9 +777,9 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu Ebuf=%u ixnCt=%lu workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log ); sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
...@@ -800,9 +800,9 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -800,9 +800,9 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
} else { } else {
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u xnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
......
...@@ -430,9 +430,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -430,9 +430,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
...@@ -472,9 +472,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -472,9 +472,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
amoebaGpu->psWorkArray_3_1->Download(); amoebaGpu->psWorkArray_3_1->Download();
......
...@@ -116,9 +116,9 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -116,9 +116,9 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
...@@ -154,9 +154,9 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -154,9 +154,9 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
amoebaGpu->psWorkArray_3_1->Download(); amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download(); amoebaGpu->psWorkArray_3_2->Download();
......
...@@ -1895,10 +1895,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1895,10 +1895,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwood: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu Ebuf=%u ixnCt=%lu workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwood: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, maxThreads, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, maxThreads, amoebaGpu->bOutputBufferPerWarp,
sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock, sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
} }
...@@ -1917,10 +1917,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1917,10 +1917,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
} else { } else {
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock, sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
......
...@@ -1064,10 +1064,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1064,10 +1064,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log && timestep == 1 ){ if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu" (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" Ebuf=%u ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n", " ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock, sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#endif #endif
......
...@@ -544,10 +544,10 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -544,10 +544,10 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
......
...@@ -273,10 +273,10 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -273,10 +273,10 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( kCalculateAmoebaMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
......
...@@ -1238,10 +1238,10 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1238,10 +1238,10 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)),
sizeof(PmeDirectElectrostaticParticle) ); sizeof(PmeDirectElectrostaticParticle) );
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock, sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
...@@ -1314,18 +1314,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1314,18 +1314,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
/*
gpu->psEnergy->Download();
double energy = 0.0;
for( unsigned int ii = 0; ii < gpu->sim.energyOutputBuffers; ii++ ){
if( (*gpu->psEnergy)[ii] != (*gpu->psEnergy)[ii] || (*gpu->psEnergy)[ii] == std::numeric_limits<double>::infinity() || (*gpu->psEnergy)[ii] == -std::numeric_limits<double>::infinity() ){
(void) fprintf( amoebaGpu->log,"Energy nan at index=%d\n", ii );
} else {
energy += (*gpu->psEnergy)[ii];
}
}
(void) fprintf( amoebaGpu->log,"Force sums: [%16.9e %16.9e %16.9e] Energy=%16.9e\n", forceSum[0], forceSum[1], forceSum[2], energy );
*/
if( 0 ){ if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" ); (void) fprintf( amoebaGpu->log,"DebugElec\n" );
......
...@@ -468,9 +468,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -468,9 +468,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)), threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock ); (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n", (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
/* /*
......
...@@ -389,7 +389,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -389,7 +389,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxSlots = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray"); CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms); memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload(); debugArray->Upload();
...@@ -414,10 +414,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -414,10 +414,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "Cutoff -- use warp\n" ); (void) fprintf( amoebaGpu->log, "Cutoff -- use warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
//gpu->sim.pInteractingWorkUnit, //gpu->sim.pInteractingWorkUnit,
...@@ -436,10 +436,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -436,10 +436,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "Cutoff no warp\n" ); (void) fprintf( amoebaGpu->log, "Cutoff no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
......
...@@ -123,6 +123,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -123,6 +123,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f; fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
/*
if( atomI == targetAtom || (y+j) == targetAtom ){ if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI; unsigned int index = atomI == targetAtom ? (y+j) : atomI;
unsigned int pullBackIndex = 0; unsigned int pullBackIndex = 0;
...@@ -190,6 +191,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -190,6 +191,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].w = + 10.0f; debugArray[index].w = + 10.0f;
} }
*/
#endif #endif
} }
......
...@@ -434,6 +434,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -434,6 +434,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
amoebaGpu->psMolecularDipole->_pSysData[mIndex], amoebaGpu->psMolecularDipole->_pSysData[mIndex],
amoebaGpu->psMolecularDipole->_pSysData[mIndex+1], amoebaGpu->psMolecularDipole->_pSysData[mIndex+1],
amoebaGpu->psMolecularDipole->_pSysData[mIndex+2] ); amoebaGpu->psMolecularDipole->_pSysData[mIndex+2] );
if( ii == 30 )ii = gpu->natoms - 30;
} }
} }
// int64 kernelTime = AmoebaTiming::getTimeOfDay(); // int64 kernelTime = AmoebaTiming::getTimeOfDay();
...@@ -459,7 +460,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -459,7 +460,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
amoebaGpu->psLabFrameQuadrupole->Download(); amoebaGpu->psLabFrameQuadrupole->Download();
(void) fprintf( amoebaGpu->log, "psLabFrameQpole completed\n" ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "psLabFrameQpole completed\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 10; int maxPrint = 20;
for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){ for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
int dipoleOffset = 3*ii; int dipoleOffset = 3*ii;
......
...@@ -525,10 +525,10 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -525,10 +525,10 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#ifdef AMOEBA_DEBUG_PRINT #ifdef AMOEBA_DEBUG_PRINT
(void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp ); (void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
(void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock, sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
if( 0 ){ if( 0 ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
amoebaGpu->psVdwWorkUnit->Download(); amoebaGpu->psVdwWorkUnit->Download();
......
...@@ -434,10 +434,10 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu ) ...@@ -434,10 +434,10 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(WcaDispersionParticle), sizeof(WcaDispersionParticle)*threadsPerBlock, sizeof(WcaDispersionParticle), sizeof(WcaDispersionParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment