"platforms/common/vscode:/vscode.git/clone" did not exist on "be19e0222ddf66f612016a3c1f687161a53c2396"
Commit c97d5ee1 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimization: eliminated an unnecessary gpu->cpu data transfer on each time step

parent d63d3fc6
...@@ -1475,7 +1475,9 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu) ...@@ -1475,7 +1475,9 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
count++; count++;
} }
} }
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
gpu->psInteractionCount->Upload();
psWorkUnit->Upload(); psWorkUnit->Upload();
gpuSetConstants(gpu); gpuSetConstants(gpu);
return cells; return cells;
......
...@@ -132,16 +132,15 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -132,16 +132,15 @@ void kCalculateCDLJForces(gpuContext gpu)
{ {
// printf("kCalculateCDLJCutoffForces\n"); // printf("kCalculateCDLJCutoffForces\n");
CUDPPResult result; CUDPPResult result;
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJN2Forces"); LAUNCHERROR("kCalculateCDLJN2Forces");
break; break;
case CUTOFF: case CUTOFF:
...@@ -156,16 +155,14 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -156,16 +155,14 @@ void kCalculateCDLJForces(gpuContext gpu)
printf("Error in cudppCompact: %d\n", result); printf("Error in cudppCompact: %d\n", result);
exit(-1); exit(-1);
} }
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJCutoffForces"); LAUNCHERROR("kCalculateCDLJCutoffForces");
break; break;
case PERIODIC: case PERIODIC:
...@@ -180,16 +177,14 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -180,16 +177,14 @@ void kCalculateCDLJForces(gpuContext gpu)
printf("Error in cudppCompact: %d\n", result); printf("Error in cudppCompact: %d\n", result);
exit(-1); exit(-1);
} }
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJPeriodicForces"); LAUNCHERROR("kCalculateCDLJPeriodicForces");
break; break;
case EWALD: case EWALD:
...@@ -204,16 +199,14 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -204,16 +199,14 @@ void kCalculateCDLJForces(gpuContext gpu)
printf("Error in cudppCompact: %d\n", result); printf("Error in cudppCompact: %d\n", result);
exit(-1); exit(-1);
} }
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces"); LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
} }
......
...@@ -35,11 +35,12 @@ ...@@ -35,11 +35,12 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, unsigned int numWorkUnits) __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
......
...@@ -106,8 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel(); ...@@ -106,8 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel(); extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel(); extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel(); extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, unsigned int); extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, unsigned int); extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu) void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
...@@ -117,7 +117,6 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -117,7 +117,6 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
kClearBornForces(gpu); kClearBornForces(gpu);
CUDPPResult result; CUDPPResult result;
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
...@@ -128,10 +127,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -128,10 +127,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1"); LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break; break;
case CUTOFF: case CUTOFF:
...@@ -146,10 +145,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -146,10 +145,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
printf("Error in cudppCompact: %d\n", result); printf("Error in cudppCompact: %d\n", result);
exit(-1); exit(-1);
} }
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bRecalculateBornRadii) if (gpu->bRecalculateBornRadii)
{ {
kCalculateObcGbsaBornSum(gpu); kCalculateObcGbsaBornSum(gpu);
...@@ -157,10 +154,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -157,10 +154,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break; break;
case PERIODIC: case PERIODIC:
...@@ -175,10 +172,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -175,10 +172,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
printf("Error in cudppCompact: %d\n", result); printf("Error in cudppCompact: %d\n", result);
exit(-1); exit(-1);
} }
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bRecalculateBornRadii) if (gpu->bRecalculateBornRadii)
{ {
kCalculateObcGbsaBornSum(gpu); kCalculateObcGbsaBornSum(gpu);
...@@ -186,10 +181,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -186,10 +181,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break; break;
} }
......
...@@ -35,11 +35,12 @@ ...@@ -35,11 +35,12 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, unsigned int numWorkUnits) __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
......
...@@ -160,34 +160,31 @@ void kCalculateObcGbsaBornSum(gpuContext gpu) ...@@ -160,34 +160,31 @@ void kCalculateObcGbsaBornSum(gpuContext gpu)
// printf("kCalculateObcgbsaBornSum\n"); // printf("kCalculateObcgbsaBornSum\n");
kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>(); kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearBornSum"); LAUNCHERROR("kClearBornSum");
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
break; break;
case CUTOFF: case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
case PERIODIC: case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
} }
LAUNCHERROR("kCalculateBornSum"); LAUNCHERROR("kCalculateBornSum");
......
...@@ -35,10 +35,11 @@ ...@@ -35,10 +35,11 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit, unsigned int workUnits) __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
int end = workUnits / gridDim.x; unsigned int numWorkUnits = cSim.pInteractionCount[0];
int end = numWorkUnits / gridDim.x;
int pos = end - (threadIdx.x >> GRIDBITS) - 1; int pos = end - (threadIdx.x >> GRIDBITS) - 1;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
...@@ -50,7 +51,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor ...@@ -50,7 +51,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
while (pos >= 0) while (pos >= 0)
{ {
// Extract cell coordinates from appropriate work unit // Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos + (blockIdx.x*workUnits)/gridDim.x]; unsigned int x = workUnit[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS; unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS; x = (x >> 17) << GRIDBITS;
float dx; float dx;
...@@ -146,7 +147,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor ...@@ -146,7 +147,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
sA[threadIdx.x].sum = apos.w = 0.0f; sA[threadIdx.x].sum = apos.w = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*workUnits)/gridDim.x]; unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
if (flags == 0) if (flags == 0)
{ {
// No interactions in this block. // No interactions in this block.
......
...@@ -107,34 +107,31 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu) ...@@ -107,34 +107,31 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
void kCalculateObcGbsaForces2(gpuContext gpu) void kCalculateObcGbsaForces2(gpuContext gpu)
{ {
//printf("kCalculateObcGbsaForces2\n"); //printf("kCalculateObcGbsaForces2\n");
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
break; break;
case CUTOFF: case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
case PERIODIC: case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
} }
LAUNCHERROR("kCalculateObcGbsaForces2"); LAUNCHERROR("kCalculateObcGbsaForces2");
......
...@@ -35,11 +35,12 @@ ...@@ -35,11 +35,12 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits) __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
......
...@@ -117,11 +117,12 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)() ...@@ -117,11 +117,12 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
* Compare each atom in one block to the bounding box of another block, and set * Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting. * flags for which ones are interacting.
*/ */
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, unsigned int numWorkUnits) __global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit)
{ {
extern __shared__ unsigned int flags[]; extern __shared__ unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int index = threadIdx.x & (GRID - 1); unsigned int index = threadIdx.x & (GRID - 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment