"platforms/cuda/tests/TestCudaCompoundIntegrator.cpp" did not exist on "1de2e2a018e190da664165dc0ac343fc16143c66"
Commit e04283fe authored by Peter Eastman's avatar Peter Eastman
Browse files

Implemented finer grained neighbor list. Also various minor optimizations.

parent 56d7ed02
......@@ -47,7 +47,7 @@ using namespace std;
static void calcForces(OpenMMContextImpl& context, CudaPlatform::PlatformData& data) {
_gpuContext* gpu = data.gpu;
if (data.stepCount%100 == 0)
if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0)
gpuReorderAtoms(gpu);
data.stepCount++;
kClearForces(gpu);
......
......@@ -3127,10 +3127,10 @@ void gpuReorderAtoms(gpuContext gpu)
// Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000); // For small systems, a simple zigzag curve works better than a Hilbert curve.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
float binWidth;
if (useHilbert)
binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0;
binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0;
else
binWidth = 0.2*sqrt(gpu->sim.nonbondedCutoffSqr);
int xbins = 1 + (int) ((maxx-minx)/binWidth);
......
......@@ -140,12 +140,14 @@ void kCalculateCDLJForces(gpuContext gpu)
}
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
......@@ -162,12 +164,14 @@ void kCalculateCDLJForces(gpuContext gpu)
}
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
}
}
\ No newline at end of file
......@@ -42,6 +42,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
#endif
int lasty = -1;
while (pos < end)
......@@ -210,47 +213,142 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
apos.w *= cSim.epsfac;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#else
dEdR += apos.w * psA[tj].q * invR;
dEdR += apos.w * psA[tj].q * invR;
#endif
dEdR *= invR * invR;
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
#endif
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#else
dEdR += apos.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
#endif
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tempBuffer[threadIdx.x].x = dx;
tempBuffer[threadIdx.x].y = dy;
tempBuffer[threadIdx.x].z = dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
}
else // bExclusion
{
......
......@@ -106,6 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, int);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, int);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
......@@ -145,19 +147,21 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
exit(-1);
}
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
if (gpu->bRecalculateBornRadii)
{
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJCutoffForces");
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
......@@ -172,19 +176,21 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
exit(-1);
}
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
if (gpu->bRecalculateBornRadii)
{
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break;
}
}
......@@ -42,6 +42,9 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
int lasty = -1;
while (pos < end)
......@@ -241,63 +244,184 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
apos.w *= cSim.epsfac;
if (!bExclusionFlag)
{
for (int j = 0; j < GRID; j++)
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#else
dEdR += apos.w * psA[tj].q * invR;
dEdR += apos.w * psA[tj].q * invR;
#endif
dEdR *= invR * invR;
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
#endif
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#else
dEdR += apos.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Sum the Born forces.
tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
#endif
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tempBuffer[threadIdx.x] = dx;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
tempBuffer[threadIdx.x] = dy;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
tempBuffer[threadIdx.x] = dz;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
}
else // bExclusion
{
......
......@@ -187,19 +187,19 @@ void kCalculateObcGbsaBornSum(gpuContext gpu)
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateBornSum");
......
......@@ -43,6 +43,9 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
#endif
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
while (pos >= 0)
{
......@@ -107,8 +110,8 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[j].r - r))
float rj = psA[j].r;
if (ar.x < (rj - r))
{
apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
}
......@@ -142,69 +145,169 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sum = apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*workUnits)/gridDim.x];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_PERIODIC
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (r2 < cSim.nonbondedCutoffSqr)
if (r2 < cSim.nonbondedCutoffSqr)
#endif
{
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[tj].sr - r))
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
float srj = psA[tj].sr;
if (ar.x < (srj - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[tj].r;
if (rj < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
}
psA[tj].sum += term;
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
tj = (tj - 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
if (psA[tj].r < (ar.y - r))
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
tempBuffer[threadIdx.x] = 0.0f;
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_PERIODIC
if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (r2 < cSim.nonbondedCutoffSqr)
#endif
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
float srj = psA[j].sr;
if (ar.x < (srj - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[j].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[j].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[j].r;
if (rj < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[j].r) - l_ij);
}
tempBuffer[threadIdx.x] = term;
}
}
psA[tj].sum += term;
// Sum the terms.
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
tj = (tj - 1) & (GRID - 1);
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
......
......@@ -47,7 +47,6 @@ struct Atom {
float z;
float r;
float sr;
float sr2;
float fx;
float fy;
float fz;
......@@ -123,19 +122,19 @@ void kCalculateObcGbsaForces2(gpuContext gpu)
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateObcGbsaForces2");
......
......@@ -38,10 +38,13 @@
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, int numWorkUnits)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif
int lasty = -1;
while (pos < end)
......@@ -72,7 +75,6 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
......@@ -105,7 +107,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
(1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
......@@ -162,98 +164,242 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
}
float sr2 = a.y * a.y;
for (int j = 0; j < GRID; j++)
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
// Compute all interactions within this block.
for (int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Interleaved Atom I and J Born Forces and sum components
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float l_ijI = 1.0f / max(psA[tj].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = log (u_ijJ / l_ijJ);
float t1I = log (u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Interleaved Atom I and J Born Forces and sum components
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float l_ijI = 1.0f / max(psA[tj].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = log (u_ijJ / l_ijJ);
float t1I = log (u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr * psA[tj].sr * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
#if defined USE_PERIODIC
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
#else
if (a.x >= rScaledRadiusJ)
if (a.x >= rScaledRadiusJ)
#endif
{
dE = 0.0f;
}
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[tj].fb * term;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[tj].fb * term;
float rj = psA[tj].r;
#ifdef USE_PERIODIC
if (psA[tj].r >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (psA[tj].r >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
#else
if (psA[tj].r >= rScaledRadiusI)
if (rj >= rScaledRadiusI)
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (int j = 0; j < GRID; j++)
{
dE = 0.0f;
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Interleaved Atom I and J Born Forces and sum components
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[j].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[j].sr));
float l_ijI = 1.0f / max(psA[j].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = log (u_ijJ / l_ijJ);
float t1I = log (u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
#if defined USE_PERIODIC
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
#else
if (a.x >= rScaledRadiusJ)
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
tempBuffer[threadIdx.x].x = d;
d = dy * dE;
af.y -= d;
tempBuffer[threadIdx.x].y = d;
d = dz * dE;
af.z -= d;
tempBuffer[threadIdx.x].z = d;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[j].fb * term;
float rj = psA[j].r;
#ifdef USE_PERIODIC
if (rj >= rScaledRadiusI || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
#else
if (rj >= rScaledRadiusI)
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
tempBuffer[threadIdx.x].x += dx;
tempBuffer[threadIdx.x].y += dy;
tempBuffer[threadIdx.x].z += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tj = (tj + 1) & (GRID - 1);
}
#endif
// Write results
float4 of;
......
......@@ -112,3 +112,88 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
cSim.pInteractionFlag[pos] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1);
}
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, int numWorkUnits)
{
extern __shared__ unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int index = threadIdx.x & (GRID - 1);
int lasty = -1;
float4 apos;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff);
bool bExclusionFlag = (x & 0x1);
x = (x >> 17);
if (x == y || bExclusionFlag)
{
// Assume this block will be dense.
if (index == 0)
cSim.pInteractionFlag[pos] = 0xFFFFFFFF;
}
else
{
// Load the bounding box for x and the atom positions for y.
float4 center = cSim.pGridCenter[x];
float4 boxSize = cSim.pGridBoundingBox[x];
if (y != lasty)
{
apos = cSim.pPosq[(y<<GRIDBITS)+index];
}
// Find the distance of the atom from the bounding box.
float dx = apos.x-center.x;
float dy = apos.y-center.y;
float dz = apos.z-center.z;
#ifdef USE_PERIODIC
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
dx = max(0.0f, abs(dx)-boxSize.x);
dy = max(0.0f, abs(dy)-boxSize.y);
dz = max(0.0f, abs(dz)-boxSize.z);
flags[threadIdx.x] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1 << index);
// Sum the flags.
if (index % 2 == 0)
flags[threadIdx.x] += flags[threadIdx.x+1];
if (index % 4 == 0)
flags[threadIdx.x] += flags[threadIdx.x+2];
if (index % 8 == 0)
flags[threadIdx.x] += flags[threadIdx.x+4];
if (index % 16 == 0)
flags[threadIdx.x] += flags[threadIdx.x+8];
if (index == 0)
{
unsigned int allFlags = flags[threadIdx.x] + flags[threadIdx.x+16];
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
cSim.pInteractionFlag[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
}
lasty = y;
}
pos++;
}
}
......@@ -497,6 +497,36 @@ void testBlockInteractions(bool periodic) {
dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
// Check the interaction flags.
unsigned int flags = data.gpu->psInteractionFlag->_pSysData[i];
for (int atom2 = 0; atom2 < 32; atom2++) {
if ((flags & 1) == 0) {
float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2];
for (int atom1 = 0; atom1 < blockSize; ++atom1) {
float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1];
float dx = pos2.x-pos1.x;
float dy = pos2.y-pos1.y;
float dz = pos2.z-pos1.z;
if (periodic) {
dx -= floor(0.5+dx/boxSize)*boxSize;
dy -= floor(0.5+dy/boxSize)*boxSize;
dz -= floor(0.5+dz/boxSize)*boxSize;
}
if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
dx = pos2.x-center1.x;
dy = pos2.y-center1.y;
dz = pos2.z-center1.z;
dx = max(0.0f, abs(dx)-gridSize1.x);
dy = max(0.0f, abs(dy)-gridSize1.y);
dz = max(0.0f, abs(dz)-gridSize1.z);
}
ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
}
}
flags >>= 1;
}
}
// Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment