Commit 08f52358 authored by Peter Eastman's avatar Peter Eastman
Browse files

Reduced number of force buffers for GBSA

parent f5ea8297
...@@ -331,7 +331,6 @@ struct cudaGmxSimulation { ...@@ -331,7 +331,6 @@ struct cudaGmxSimulation {
unsigned int stride3; // Atomic attributes stride x 3 unsigned int stride3; // Atomic attributes stride x 3
unsigned int stride4; // Atomic attributes stride x 4 unsigned int stride4; // Atomic attributes stride x 4
unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call
unsigned int totalNonbondOutputBuffers; // Total nonbond output buffers
unsigned int outputBuffers; // Number of output buffers unsigned int outputBuffers; // Number of output buffers
unsigned int energyOutputBuffers; // Number of energy output buffers unsigned int energyOutputBuffers; // Number of energy output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms float bigFloat; // Floating point value used as a flag for Shaken atoms
...@@ -474,9 +473,7 @@ struct cudaGmxSimulation { ...@@ -474,9 +473,7 @@ struct cudaGmxSimulation {
float4* pVelm4; // Pointer to atom velocity and inverse mass float4* pVelm4; // Pointer to atom velocity and inverse mass
float4* pvVector4; // Pointer to atom v Vector float4* pvVector4; // Pointer to atom v Vector
float4* pxVector4; // Pointer to atom x Vector float4* pxVector4; // Pointer to atom x Vector
float4* pForce4; // Pointer to all force4 data float4* pForce4; // Pointer to force data
float4* pForce4a; // Pointer to first set of force4 data
float4* pForce4b; // Pointer to second set of force4 data
float* pEnergy; // Pointer to energy output buffer float* pEnergy; // Pointer to energy output buffer
float* pBornForce; // Pointer to Born force data float* pBornForce; // Pointer to Born force data
float* pBornSum; // Pointer to Born Radii calculation output buffers float* pBornSum; // Pointer to Born Radii calculation output buffers
......
...@@ -1936,8 +1936,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1936,8 +1936,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psForce4 = NULL; gpu->psForce4 = NULL;
gpu->psEnergy = NULL; gpu->psEnergy = NULL;
gpu->sim.pForce4 = NULL; gpu->sim.pForce4 = NULL;
gpu->sim.pForce4a = NULL;
gpu->sim.pForce4b = NULL;
gpu->psBornForce = NULL; gpu->psBornForce = NULL;
gpu->sim.pBornForce = NULL; gpu->sim.pBornForce = NULL;
gpu->psBornSum = NULL; gpu->psBornSum = NULL;
...@@ -2246,10 +2244,9 @@ int gpuBuildOutputBuffers(gpuContext gpu) ...@@ -2246,10 +2244,9 @@ int gpuBuildOutputBuffers(gpuContext gpu)
gpu->bOutputBufferPerWarp = false; gpu->bOutputBufferPerWarp = false;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID; gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
} }
gpu->sim.totalNonbondOutputBuffers = ( (gpu->bIncludeGBSA || gpu->bIncludeGBVI) ? 2 * gpu->sim.nonbondOutputBuffers : gpu->sim.nonbondOutputBuffers); gpu->sim.outputBuffers = gpu->sim.nonbondOutputBuffers;
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers; unsigned int outputBuffers = gpu->sim.outputBuffers;
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++) for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++)
{ {
if (outputBuffers < gpu->pOutputBufferCounter[i]) if (outputBuffers < gpu->pOutputBufferCounter[i])
...@@ -2264,8 +2261,6 @@ int gpuBuildOutputBuffers(gpuContext gpu) ...@@ -2264,8 +2261,6 @@ int gpuBuildOutputBuffers(gpuContext gpu)
gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce"); gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum"); gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0]; gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
gpu->sim.pForce4a = gpu->sim.pForce4;
gpu->sim.pForce4b = gpu->sim.pForce4 + 1 * gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
gpu->sim.pEnergy = gpu->psEnergy->_pDevStream[0]; gpu->sim.pEnergy = gpu->psEnergy->_pDevStream[0];
gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0]; gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0]; gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];
......
...@@ -243,11 +243,11 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) ...@@ -243,11 +243,11 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif #endif
float4 of = cSim.pForce4a[offset]; float4 of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
} }
else // 100% utilization else // 100% utilization
{ {
...@@ -536,21 +536,21 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) ...@@ -536,21 +536,21 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif #endif
of = cSim.pForce4a[offset]; of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride; offset = y + tgx + warp*cSim.stride;
#else #else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride; offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif #endif
of = cSim.pForce4a[offset]; of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx; of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy; of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz; of.z += sA[threadIdx.x].fz;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
lasty = y; lasty = y;
} }
......
...@@ -289,12 +289,12 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -289,12 +289,12 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif #endif
float4 of = cSim.pForce4a[offset]; float4 of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
of.w += af.w; of.w += af.w;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w; cSim.pBornForce[offset] = of.w;
} }
else // 100% utilization else // 100% utilization
...@@ -671,24 +671,24 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -671,24 +671,24 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif #endif
float4 of = cSim.pForce4a[offset]; float4 of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
of.w += af.w; of.w += af.w;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w; cSim.pBornForce[offset] = of.w;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride; offset = y + tgx + warp*cSim.stride;
#else #else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride; offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif #endif
of = cSim.pForce4a[offset]; of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx; of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy; of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz; of.z += sA[threadIdx.x].fz;
of.w += sA[threadIdx.x].fb; of.w += sA[threadIdx.x].fb;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w; cSim.pBornForce[offset] = of.w;
lasty = y; lasty = y;
} }
......
...@@ -126,18 +126,18 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i ...@@ -126,18 +126,18 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset]; of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
#else #else
of.x = af.x; of.x = af.x;
of.y = af.y; of.y = af.y;
of.z = af.z; of.z = af.z;
of.w = 0.0f; of.w = 0.0f;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
#endif #endif
} }
else // 100% utilization else // 100% utilization
...@@ -370,29 +370,29 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i ...@@ -370,29 +370,29 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset]; of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
offset = y + tgx + warp*cSim.stride; offset = y + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset]; of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx; of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy; of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz; of.z += sA[threadIdx.x].fz;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
#else #else
of.x = af.x; of.x = af.x;
of.y = af.y; of.y = af.y;
of.z = af.z; of.z = af.z;
of.w = 0.0f; of.w = 0.0f;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
of.x = sA[threadIdx.x].fx; of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy; of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz; of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride; offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of; cSim.pForce4[offset] = of;
#endif #endif
lasty = y; lasty = y;
} }
......
...@@ -128,7 +128,7 @@ __global__ void kCalculateGBVIForces2a_kernel() ...@@ -128,7 +128,7 @@ __global__ void kCalculateGBVIForces2a_kernel()
} }
// Write results // Write results
cSim.pForce4a[pos] = force; cSim.pForce4[pos] = force;
} }
......
...@@ -130,20 +130,14 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int ...@@ -130,20 +130,14 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
of = cSim.pForce4b[offset]; #endif
of = cSim.pForce4[offset];
of.x += af.x + sA[threadIdx.x].fx; of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy; of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz; of.z += af.z + sA[threadIdx.x].fz;
of.w = 0.0f; cSim.pForce4[offset] = of;
cSim.pForce4b[offset] = of;
#endif
} }
else else
{ {
...@@ -349,32 +343,24 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int ...@@ -349,32 +343,24 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4b[offset] = of;
offset = y + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
of = cSim.pForce4b[offset]; #endif
of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
of.w = 0.0f; cSim.pForce4[offset] = of;
cSim.pForce4b[offset] = of; #ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride; offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
of = cSim.pForce4b[offset]; #endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx; of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy; of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz; of.z += sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of; cSim.pForce4[offset] = of;
#endif
} }
lasty = y; lasty = y;
pos++; pos++;
......
...@@ -140,19 +140,14 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit) ...@@ -140,19 +140,14 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
#endif #endif
of = cSim.pForce4[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
} }
else else
{ {
...@@ -409,30 +404,24 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit) ...@@ -409,30 +404,24 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset]; #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
of.z += af.z; of.z += af.z;
cSim.pForce4b[offset] = of; cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride; offset = y + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride; offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#endif #endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
} }
lasty = y; lasty = y;
pos++; pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment