Commit f5ea8297 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimization (clear the force and Born sum buffers in a single kernel)

parent 2cb112f0
...@@ -47,7 +47,10 @@ void CudaCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context) ...@@ -47,7 +47,10 @@ void CudaCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context)
if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0) if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0)
gpuReorderAtoms(gpu); gpuReorderAtoms(gpu);
data.computeForceCount++; data.computeForceCount++;
kClearForces(gpu); if (gpu->bIncludeGBSA || gpu->bIncludeGBVI)
kClearBornSumAndForces(gpu);
else
kClearForces(gpu);
} }
void CudaCalcForcesAndEnergyKernel::finishForceComputation(ContextImpl& context) { void CudaCalcForcesAndEnergyKernel::finishForceComputation(ContextImpl& context) {
...@@ -77,6 +80,8 @@ void CudaCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& context) ...@@ -77,6 +80,8 @@ void CudaCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& context)
gpuReorderAtoms(gpu); gpuReorderAtoms(gpu);
data.stepCount++; data.stepCount++;
kClearEnergy(gpu); kClearEnergy(gpu);
if (gpu->bIncludeGBSA || gpu->bIncludeGBVI)
kClearBornSumAndForces(gpu);
} }
double CudaCalcForcesAndEnergyKernel::finishEnergyComputation(ContextImpl& context) { double CudaCalcForcesAndEnergyKernel::finishEnergyComputation(ContextImpl& context) {
...@@ -790,8 +795,10 @@ void OPENMMCUDA_EXPORT OpenMM::cudaOpenMMInitializeIntegration(const System& sys ...@@ -790,8 +795,10 @@ void OPENMMCUDA_EXPORT OpenMM::cudaOpenMMInitializeIntegration(const System& sys
gpuBuildExclusionList(gpu); gpuBuildExclusionList(gpu);
gpuBuildOutputBuffers(gpu); gpuBuildOutputBuffers(gpu);
gpuSetConstants(gpu); gpuSetConstants(gpu);
kClearBornForces(gpu); if (gpu->bIncludeGBSA || gpu->bIncludeGBVI)
kClearForces(gpu); kClearBornSumAndForces(gpu);
else
kClearForces(gpu);
cudaThreadSynchronize(); cudaThreadSynchronize();
} }
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
// Initialization // Initialization
extern void kClearForces(gpuContext gpu); extern void kClearForces(gpuContext gpu);
extern void kClearEnergy(gpuContext gpu); extern void kClearEnergy(gpuContext gpu);
extern void kClearBornForces(gpuContext gpu); extern void kClearBornSumAndForces(gpuContext gpu);
extern void kClearObcGbsaBornSum(gpuContext gpu); extern void kClearObcGbsaBornSum(gpuContext gpu);
extern void kCalculateObcGbsaBornSum(gpuContext gpu); extern void kCalculateObcGbsaBornSum(gpuContext gpu);
extern void kReduceObcGbsaBornSum(gpuContext gpu); extern void kReduceObcGbsaBornSum(gpuContext gpu);
......
...@@ -134,10 +134,6 @@ extern void kCalculatePME(gpuContext gpu); ...@@ -134,10 +134,6 @@ extern void kCalculatePME(gpuContext gpu);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu) void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
// printf("kCalculateCDLJObcGbsaForces1\n"); // printf("kCalculateCDLJObcGbsaForces1\n");
// check if Born radii need to be calculated
kClearBornForces(gpu);
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
......
...@@ -102,21 +102,6 @@ void GetCalculateGBVIBornSumSim(gpuContext gpu) ...@@ -102,21 +102,6 @@ void GetCalculateGBVIBornSumSim(gpuContext gpu)
#define METHOD_NAME(a, b) a##PeriodicByWarp##b #define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVIBornSum.h" #include "kCalculateGBVIBornSum.h"
__global__ void kClearGBVIBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearGBVIBornSum(gpuContext gpu) {
kClearGBVIBornSum_kernel<<<gpu->sim.blocks, 384>>>();
}
__global__ void kReduceGBVIBornSum_kernel() __global__ void kReduceGBVIBornSum_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -177,8 +162,6 @@ void kReduceGBVIBornSum(gpuContext gpu) ...@@ -177,8 +162,6 @@ void kReduceGBVIBornSum(gpuContext gpu)
void kCalculateGBVIBornSum(gpuContext gpu) void kCalculateGBVIBornSum(gpuContext gpu)
{ {
//printf("kCalculateGBVIBornSum\n"); //printf("kCalculateGBVIBornSum\n");
kClearGBVIBornSum( gpu );
LAUNCHERROR("kClearBornSum");
//size_t numWithInteractions; //size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
......
...@@ -97,19 +97,6 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu) ...@@ -97,19 +97,6 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
#define METHOD_NAME(a, b) a##PeriodicByWarp##b #define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h" #include "kCalculateObcGbsaBornSum.h"
__global__
__launch_bounds__(384, 1)
void kClearObcGbsaBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
__global__ __global__
__launch_bounds__(384, 1) __launch_bounds__(384, 1)
void kReduceObcGbsaBornSum_kernel() void kReduceObcGbsaBornSum_kernel()
...@@ -154,17 +141,9 @@ void kReduceObcGbsaBornSum(gpuContext gpu) ...@@ -154,17 +141,9 @@ void kReduceObcGbsaBornSum(gpuContext gpu)
LAUNCHERROR("kReduceObcGbsaBornSum"); LAUNCHERROR("kReduceObcGbsaBornSum");
} }
extern void kClearObcGbsaBornSum(gpuContext gpu)
{
// printf("kClearObcGbsaBornSum\n");
kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
}
void kCalculateObcGbsaBornSum(gpuContext gpu) void kCalculateObcGbsaBornSum(gpuContext gpu)
{ {
// printf("kCalculateObcgbsaBornSum\n"); // printf("kCalculateObcgbsaBornSum\n");
kClearObcGbsaBornSum(gpu);
LAUNCHERROR("kClearBornSum");
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
......
...@@ -53,8 +53,8 @@ void GetForcesSim(gpuContext gpu) ...@@ -53,8 +53,8 @@ void GetForcesSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed");
} }
__global__ __global__
__launch_bounds__(384, 1) __launch_bounds__(384, 1)
void kClearForces_kernel() void kClearForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -72,27 +72,34 @@ void kClearForces(gpuContext gpu) ...@@ -72,27 +72,34 @@ void kClearForces(gpuContext gpu)
LAUNCHERROR("kClearForces"); LAUNCHERROR("kClearForces");
} }
__global__ __global__
__launch_bounds__(384, 1) __launch_bounds__(384, 1)
void kClearBornForces_kernel() void kClearBornSumAndForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers) while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{ {
((float*)cSim.pBornForce)[pos] = 0.0f; cSim.pBornSum[pos] = 0.0f;
cSim.pBornForce[pos] = 0.0f;
cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x;
}
while (pos < cSim.stride * cSim.outputBuffers)
{
cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x; pos += gridDim.x * blockDim.x;
} }
} }
void kClearBornForces(gpuContext gpu) void kClearBornSumAndForces(gpuContext gpu)
{ {
// printf("kClearBornForces\n"); // printf("kClearBornSumAndForces\n");
kClearBornForces_kernel<<<gpu->sim.blocks, 384>>>(); kClearBornSumAndForces_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearBornForces"); LAUNCHERROR("kClearBornSumAndForces");
} }
__global__ __global__
__launch_bounds__(384, 1) __launch_bounds__(384, 1)
void kClearEnergy_kernel() void kClearEnergy_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -110,15 +117,15 @@ void kClearEnergy(gpuContext gpu) ...@@ -110,15 +117,15 @@ void kClearEnergy(gpuContext gpu)
LAUNCHERROR("kClearEnergy"); LAUNCHERROR("kClearEnergy");
} }
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif #endif
void kReduceBornSumAndForces_kernel() void kReduceBornSumAndForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -221,14 +228,14 @@ void kReduceBornSumAndForces(gpuContext gpu) ...@@ -221,14 +228,14 @@ void kReduceBornSumAndForces(gpuContext gpu)
LAUNCHERROR("kReduceBornSumAndForces"); LAUNCHERROR("kReduceBornSumAndForces");
} }
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif #endif
void kReduceForces_kernel() void kReduceForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -291,15 +298,15 @@ double kReduceEnergy(gpuContext gpu) ...@@ -291,15 +298,15 @@ double kReduceEnergy(gpuContext gpu)
return sum; return sum;
} }
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1) __launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif #endif
void kReduceObcGbsaBornForces_kernel() void kReduceObcGbsaBornForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f; float energy = 0.0f;
...@@ -357,15 +364,15 @@ void kReduceObcGbsaBornForces_kernel() ...@@ -357,15 +364,15 @@ void kReduceObcGbsaBornForces_kernel()
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f; cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
} }
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1) __launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif #endif
void kReduceGBVIBornForces_kernel() void kReduceGBVIBornForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f; float energy = 0.0f;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment