"vscode:/vscode.git/clone" did not exist on "ce661524d5e708ee5e0102dd49a50cbe0465ed6b"
Commit b9d12c46 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Clean code and several bug fixes to GB/VI and Obc for nonbonded methods w/ cutoffs

parent ce74c6ae
...@@ -133,3 +133,7 @@ extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues) ...@@ -133,3 +133,7 @@ extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues)
extern void SetCustomNonbondedForceExpression(const Expression<256>& expression); extern void SetCustomNonbondedForceExpression(const Expression<256>& expression);
extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression); extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression);
extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues); extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues);
extern void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log);
extern void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log);
...@@ -64,16 +64,6 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu) ...@@ -64,16 +64,6 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
static __device__ float fastErfc(float r)
{
float normalized = cSim.tabulatedErfcScale*r;
int index = (int) normalized;
float fract2 = normalized-index;
float fract1 = 1.0f-fract2;
return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
}
// Include versions of the kernel for N^2 calculations. // Include versions of the kernel for N^2 calculations.
...@@ -134,9 +124,11 @@ extern void kCalculatePME(gpuContext gpu); ...@@ -134,9 +124,11 @@ extern void kCalculatePME(gpuContext gpu);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu) void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
// printf("kCalculateCDLJObcGbsaForces1\n"); // printf("kCalculateCDLJObcGbsaForces1\n");
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bRecalculateBornRadii) if (gpu->bRecalculateBornRadii)
{ {
if( gpu->bIncludeGBVI ){ if( gpu->bIncludeGBVI ){
...@@ -148,16 +140,19 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -148,16 +140,19 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1"); LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break; break;
case CUTOFF: case CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>(); kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff"); LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
...@@ -165,20 +160,31 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -165,20 +160,31 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount); compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bRecalculateBornRadii) if (gpu->bRecalculateBornRadii)
{ {
kCalculateObcGbsaBornSum(gpu); if( gpu->bIncludeGBVI ){
kReduceObcGbsaBornSum(gpu); kCalculateGBVIBornSum(gpu);
kReduceGBVIBornSum(gpu);
} else {
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break; break;
case PERIODIC: case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>(); kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic"); LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
...@@ -186,52 +192,28 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -186,52 +192,28 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount); compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bRecalculateBornRadii) if (gpu->bRecalculateBornRadii)
{ {
kCalculateObcGbsaBornSum(gpu); if( gpu->bIncludeGBVI ){
kReduceObcGbsaBornSum(gpu); kCalculateGBVIBornSum(gpu);
kReduceGBVIBornSum(gpu);
} else {
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break; break;
case EWALD:
case PARTICLE_MESH_EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
if (gpu->bRecalculateBornRadii)
{
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces");
if (gpu->sim.nonbondedMethod == EWALD)
{
// Ewald summation
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
}
else
kCalculatePME(gpu);
} }
} }
...@@ -30,9 +30,6 @@ ...@@ -30,9 +30,6 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
...@@ -41,25 +38,21 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -41,25 +38,21 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
float CDLJObcGbsa_energy; float CDLJObcGbsa_energy;
float energy = 0.0f; float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
#ifdef USE_EWALD
const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
#endif #endif
unsigned int lasty = -0xFFFFFFFF; unsigned int lasty = -0xFFFFFFFF;
while (pos < end) while (pos < end)
{ {
...@@ -101,9 +94,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -101,9 +94,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dy = psA[j].y - apos.y; float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z; float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
float r2 = dx * dx + dy * dy + dz * dz; float r2 = dx * dx + dy * dy + dz * dz;
...@@ -115,26 +108,14 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -115,26 +108,14 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps; float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */ CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR;
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC); CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else #else
float factorX = apos.w * psA[j].q * invR; float factorX = apos.w * psA[j].q * invR;
dEdR += factorX; dEdR += factorX;
/* E */
CDLJObcGbsa_energy += factorX; CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
...@@ -149,18 +130,18 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -149,18 +130,18 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij); float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator; CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if ( i >= cSim.atoms || (x+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (x+j) >= cSim.atoms)
#endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
}
#endif
if (i < cSim.atoms)
{
energy += 0.5f*CDLJObcGbsa_energy;
} }
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces // Add Forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -169,10 +150,11 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -169,10 +150,11 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
af.y -= dy; af.y -= dy;
af.z -= dz; af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[j].br; af.w += dGpol_dalpha2_ij * psA[j].br;
} }
}
else // bExclusion } else {
{
unsigned int xi = x>>GRIDBITS; unsigned int xi = x>>GRIDBITS;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx]; unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
...@@ -182,11 +164,11 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -182,11 +164,11 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dy = psA[j].y - apos.y; float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z; float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
float r2 = dx * dx + dy * dy + dz * dz; float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part // CDLJ part
float invR = 1.0f / sqrt(r2); float invR = 1.0f / sqrt(r2);
...@@ -198,24 +180,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -198,24 +180,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6; CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR;
bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
if (needCorrection)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
dEdR = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
}
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC); CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else #else
float factorX = apos.w * psA[j].q * invR; float factorX = apos.w * psA[j].q * invR;
...@@ -223,13 +189,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -223,13 +189,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
CDLJObcGbsa_energy += factorX; CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_EWALD
if (!(excl & 0x1) && !needCorrection)
#else
if (!(excl & 0x1)) if (!(excl & 0x1))
#endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
} }
...@@ -243,26 +205,18 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -243,26 +205,18 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij); float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator; CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#if defined USE_PERIODIC #if defined USE_CUTOFF
if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr) if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
{ #else
dEdR = 0.0f; if (i >= cSim.atoms || x+j >= cSim.atoms )
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
#elif defined USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
#endif #endif
/* E */
if (i < cSim.atoms)
{ {
energy += 0.5f*CDLJObcGbsa_energy; dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
} }
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces // Add Forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -288,9 +242,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -288,9 +242,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
of.w += af.w; of.w += af.w;
cSim.pForce4[offset] = of; cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w; cSim.pBornForce[offset] = of.w;
}
else // 100% utilization } else {
{
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
if (lasty != y) if (lasty != y)
{ {
...@@ -330,9 +283,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -330,9 +283,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dy = psA[tj].y - apos.y; float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z; float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
float r2 = dx * dx + dy * dy + dz * dz; float r2 = dx * dx + dy * dy + dz * dz;
...@@ -346,16 +299,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -346,16 +299,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6; CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
#else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC); CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else #else
float factorX = apos.w * psA[tj].q * invR; float factorX = apos.w * psA[tj].q * invR;
...@@ -375,18 +320,17 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -375,18 +320,17 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator; CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if ( i >= cSim.atoms || (y+tj) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (y+tj) >= cSim.atoms)
#endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
} }
#endif energy += CDLJObcGbsa_energy;
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy;
}
// Add forces // Add forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -399,7 +343,10 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -399,7 +343,10 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
psA[tj].fy += dy; psA[tj].fy += dy;
psA[tj].fz += dz; psA[tj].fz += dz;
psA[tj].fb += dGpol_dalpha2_ij * br; psA[tj].fb += dGpol_dalpha2_ij * br;
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} }
} }
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -431,16 +378,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -431,16 +378,8 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6; CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR;
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC); CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else #else
float factorX = apos.w * psA[j].q * invR; float factorX = apos.w * psA[j].q * invR;
...@@ -461,17 +400,17 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -461,17 +400,17 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator; CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if ( i >= cSim.atoms || (y+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
{ #else
dEdR = 0.0f; if ( i >= cSim.atoms || (y+j) >= cSim.atoms)
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
#endif #endif
if (i < cSim.atoms)
{ {
energy += CDLJObcGbsa_energy; dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
} }
energy += CDLJObcGbsa_energy;
// Add forces // Add forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -535,9 +474,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -535,9 +474,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
} }
} }
#endif #endif
} } else {
else // bExclusion
{
unsigned int xi = x>>GRIDBITS; unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS; unsigned int yi = y>>GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2; unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
...@@ -549,9 +486,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -549,9 +486,9 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dy = psA[tj].y - apos.y; float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z; float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
float r2 = dx * dx + dy * dy + dz * dz; float r2 = dx * dx + dy * dy + dz * dz;
...@@ -565,38 +502,18 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -565,38 +502,18 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6; CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
if (needCorrection)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
dEdR = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
}
#else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC); CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else #else
float factorX = apos.w * psA[tj].q * invR; float factorX = apos.w * psA[tj].q * invR;
dEdR += factorX; dEdR += factorX;
CDLJObcGbsa_energy += factorX; CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_EWALD
if (!(excl & 0x1) && !needCorrection)
#else
if (!(excl & 0x1)) if (!(excl & 0x1))
#endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
} }
// ObcGbsaForce1 part // ObcGbsaForce1 part
...@@ -609,26 +526,19 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -609,26 +526,19 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij); float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator; CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#if defined USE_PERIODIC
#ifdef USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr) if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
} }
#elif defined USE_CUTOFF energy += CDLJObcGbsa_energy;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
#endif
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy;
}
// Add forces // Add forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -642,6 +552,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) ...@@ -642,6 +552,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
psA[tj].fz += dz; psA[tj].fz += dz;
psA[tj].fb += dGpol_dalpha2_ij * br; psA[tj].fb += dGpol_dalpha2_ij * br;
excl >>= 1; excl >>= 1;
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} }
} }
......
...@@ -237,71 +237,48 @@ __global__ void kReduceGBVIBornSum_kernel() ...@@ -237,71 +237,48 @@ __global__ void kReduceGBVIBornSum_kernel()
void kReduceGBVIBornSum(gpuContext gpu) void kReduceGBVIBornSum(gpuContext gpu)
{ {
//printf("kReduceGBVIBornSum\n"); //printf("kReduceGBVIBornSum\n");
#define GBVI_DEBUG 0
#if ( GBVI_DEBUG == 1 )
gpu->psGBVIData->Download();
gpu->psBornSum->Download();
gpu->psPosq4->Download();
(void) fprintf( stderr, "\nkReduceGBVIBornSum: Post BornSum %s Born radii & params\n",
(gpu->bIncludeGBVI ? "GBVI" : "Obc") );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( stderr, "%d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
ii,
gpu->psBornSum->_pSysStream[0][ii],
gpu->psGBVIData->_pSysStream[0][ii].x,
gpu->psGBVIData->_pSysStream[0][ii].y,
gpu->psGBVIData->_pSysStream[0][ii].z,
gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
);
}
#endif
#undef GBVI_DEBUG
kReduceGBVIBornSum_kernel<<<gpu->sim.blocks, 384>>>(); kReduceGBVIBornSum_kernel<<<gpu->sim.blocks, 384>>>();
gpu->bRecalculateBornRadii = false; gpu->bRecalculateBornRadii = false;
LAUNCHERROR("kReduceGBVIBornSum"); LAUNCHERROR("kReduceGBVIBornSum");
} }
void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log)
{
gpu->psGBVIData->Download();
gpu->psBornRadii->Download();
gpu->psBornForce->Download();
gpu->psPosq4->Download();
gpu->psSigEps2->Download();
(void) fprintf( log, "kPrintGBVI Cuda comp bR bF prm sigeps2\n" );
(void) fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n", ii,
gpu->psBornRadii->_pSysData[ii],
gpu->psBornForce->_pSysData[ii],
gpu->psGBVIData->_pSysData[ii].x,
gpu->psGBVIData->_pSysData[ii].y,
gpu->psGBVIData->_pSysData[ii].z,
gpu->psGBVIData->_pSysData[ii].w,
gpu->psSigEps2->_pSysData[ii].x,
gpu->psSigEps2->_pSysData[ii].y );
}
}
void kCalculateGBVIBornSum(gpuContext gpu) void kCalculateGBVIBornSum(gpuContext gpu)
{ {
//printf("kCalculateGBVIBornSum\n"); //printf("kCalculateGBVIBornSum\n");
//size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
#define GBVI 0
#if GBVI == 1
int maxPrint = 10;
gpu->psWorkUnit->Download();
fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
gpu->psGBVIData->Download();
gpu->psBornSum->Download();
gpu->psPosq4->Download();
(void) fprintf( stderr, "\nkCalculateGBVIBornSum: pre BornSum %s Born radii & params\n",
(gpu->bIncludeGBVI ? "GBVI" : "Obc") );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( stderr, "%d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
ii,
gpu->psBornSum->_pSysStream[0][ii],
gpu->psGBVIData->_pSysStream[0][ii].x,
gpu->psGBVIData->_pSysStream[0][ii].y,
gpu->psGBVIData->_pSysStream[0][ii].z,
gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
);
if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
ii = gpu->natoms - maxPrint;
}
}
#endif
#undef GBVI
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateGBVIN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVIN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
...@@ -310,6 +287,7 @@ fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk= ...@@ -310,6 +287,7 @@ fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
} }
break; break;
case CUTOFF: case CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
...@@ -318,6 +296,7 @@ fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk= ...@@ -318,6 +296,7 @@ fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=
kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break; break;
case PERIODIC: case PERIODIC:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
...@@ -326,6 +305,7 @@ fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk= ...@@ -326,6 +305,7 @@ fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=
kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break; break;
} }
LAUNCHERROR("kCalculateGBVIBornSum"); LAUNCHERROR("kCalculateGBVIBornSum");
} }
...@@ -167,76 +167,41 @@ __global__ void kCalculateGBVIForces2a_kernel() ...@@ -167,76 +167,41 @@ __global__ void kCalculateGBVIForces2a_kernel()
void kCalculateGBVIForces2(gpuContext gpu) void kCalculateGBVIForces2(gpuContext gpu)
{ {
//printf("kCalculateGBVIForces2\n");
size_t numWithInteractions;
#if 0
kClearForces(gpu);
(void) fprintf( stderr, "\nkCalculateGBVIForces2: cleared force prior loop2\n" ); (void) fflush( stderr );
kCalculateGBVIForces2a_kernel<<<gpu->sim.blocks, 384>>>();
(void) fprintf( stderr, "\ncalled kCalculateGBVIForces2a\n" ); (void) fflush( stderr );
return;
#endif
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVIN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVIN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit );
else else
kCalculateGBVIN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVIN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit );
//(void) fprintf( stderr, "\nkCalculateGBVIForces2: Born radii/force forces warp=%u\n", gpu->bOutputBufferPerWarp ); (void) fflush( stderr );
#define GBVI_DEBUG 0
#if ( GBVI_DEBUG == 1 )
(void) fprintf( stderr, "\nkCalculateGBVIForces2: Born radii/force forces:\n" ); (void) fflush( stderr );
gpu->psBornForce->Download();
gpu->psForce4->Download();
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e] Fb[%14.6e %14.6e %14.6e]\n",
ii,
gpu->psBornForce->_pSysStream[0][ii],
gpu->psForce4->_pSysStream[0][ii].x,
gpu->psForce4->_pSysStream[0][ii].y,
gpu->psForce4->_pSysStream[0][ii].z,
gpu->psForce4->_pSysStream[1][ii].x,
gpu->psForce4->_pSysStream[1][ii].y,
gpu->psForce4->_pSysStream[1][ii].z
);
}
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms*2; ii++ ){
(void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e %14.6e]\n",
ii,
gpu->psBornForce->_pSysStream[0][ii],
gpu->psForce4->_pSysStream[0][ii].x,
gpu->psForce4->_pSysStream[0][ii].y,
gpu->psForce4->_pSysStream[0][ii].z,
gpu->psForce4->_pSysStream[0][ii].w
);
}
#endif
#undef GBVI_DEBUG
break; break;
case CUTOFF: case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVICutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVICutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
else else
kCalculateGBVICutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVICutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
case PERIODIC: case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVIPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVIPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
else else
kCalculateGBVIPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVIPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break; break;
} }
LAUNCHERROR("kCalculateGBVIForces2"); LAUNCHERROR("kCalculateGBVIForces2");
//kPrintGBVI( gpu, "kCalculateGBVIForces2", 0, stderr);
} }
...@@ -45,15 +45,16 @@ __launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1) ...@@ -45,15 +45,16 @@ __launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif #endif
METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits) METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block]; float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif #endif
unsigned int lasty = -0xFFFFFFFF; unsigned int lasty = -0xFFFFFFFF;
......
...@@ -35,9 +35,6 @@ using namespace std; ...@@ -35,9 +35,6 @@ using namespace std;
#include "gputypes.h" #include "gputypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct Atom { struct Atom {
float x; float x;
float y; float y;
...@@ -105,19 +102,17 @@ void kReduceObcGbsaBornSum_kernel() ...@@ -105,19 +102,17 @@ void kReduceObcGbsaBornSum_kernel()
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
float sum = 0.0f; float sum = 0.0f;
float* pSt = cSim.pBornSum + pos; float* pSt = cSim.pBornSum + pos;
float2 atom = cSim.pObcData[pos]; float2 atom = cSim.pObcData[pos];
// Get summed Born data // Get summed Born data
for (int i = 0; i < cSim.nonbondOutputBuffers; i++) for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
{ {
sum += *pSt; sum += *pSt;
// printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
pSt += cSim.stride; pSt += cSim.stride;
} }
// Now calculate Born radius and OBC term. // Now calculate Born radius and OBC term.
sum *= 0.5f * atom.x; sum *= 0.5f * atom.x;
float sum2 = sum * sum; float sum2 = sum * sum;
...@@ -127,20 +122,49 @@ void kReduceObcGbsaBornSum_kernel() ...@@ -127,20 +122,49 @@ void kReduceObcGbsaBornSum_kernel()
float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii); float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2); float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii; obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
cSim.pBornRadii[pos] = bornRadius; cSim.pBornRadii[pos] = bornRadius;
cSim.pObcChain[pos] = obcChain; cSim.pObcChain[pos] = obcChain;
pos += gridDim.x * blockDim.x; pos += gridDim.x * blockDim.x;
} }
} }
void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu) void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu)
{ {
// printf("kReduceObcGbsaBornSum\n");
kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>(); kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
gpu->bRecalculateBornRadii = false; gpu->bRecalculateBornRadii = false;
LAUNCHERROR("kReduceObcGbsaBornSum"); LAUNCHERROR("kReduceObcGbsaBornSum");
} }
void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log)
{
gpu->psObcData->Download();
gpu->psBornRadii->Download();
gpu->psObcChain->Download();
gpu->psBornForce->Download();
gpu->psPosq4->Download();
gpu->psSigEps2->Download();
(void) fprintf( log, "kPrintObc Cuda bCh bR bF prm[2] sigeps[2]\n" );
(void) fprintf( stderr, "bOutputWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n", ii,
gpu->psObcChain->_pSysData[ii],
gpu->psBornRadii->_pSysData[ii],
gpu->psBornForce->_pSysData[ii],
gpu->psObcData->_pSysData[ii].x,
gpu->psObcData->_pSysData[ii].y,
gpu->psSigEps2->_pSysData[ii].x,
gpu->psSigEps2->_pSysData[ii].y );
}
}
void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu) void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu)
{ {
// printf("kCalculateObcgbsaBornSum\n"); // printf("kCalculateObcgbsaBornSum\n");
...@@ -155,7 +179,9 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu) ...@@ -155,7 +179,9 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu)
kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
break; break;
case CUTOFF: case CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
...@@ -163,7 +189,9 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu) ...@@ -163,7 +189,9 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu)
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
case PERIODIC: case PERIODIC:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
...@@ -172,5 +200,6 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu) ...@@ -172,5 +200,6 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu)
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break; break;
} }
LAUNCHERROR("kCalculateBornSum"); LAUNCHERROR("kCalculateBornSum");
} }
...@@ -41,15 +41,12 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -41,15 +41,12 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
// unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
...@@ -69,8 +66,8 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -69,8 +66,8 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
{ {
...@@ -91,15 +88,15 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -91,15 +88,15 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
dy = psA[j].y - apos.y; dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z; dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
r2 = dx * dx + dy * dy + dz * dz; r2 = dx * dx + dy * dy + dz * dz;
#if defined USE_PERIODIC #if defined USE_CUTOFF
if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr) if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF #else
if (r2 < cSim.nonbondedCutoffSqr) if (i < cSim.atoms && x+j < cSim.atoms )
#endif #endif
{ {
r = sqrt(r2); r = sqrt(r2);
...@@ -118,7 +115,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -118,7 +115,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
(0.50f * rInverse * ratio) + (0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) * (0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2); (l_ij2 - u_ij2);
float rj = psA[j].r; float rj = psA[j].sr;
if (ar.x < (rj - r)) if (ar.x < (rj - r))
{ {
apos.w += 2.0f * ((1.0f / ar.x) - l_ij); apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
...@@ -170,15 +167,15 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -170,15 +167,15 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
dy = psA[tj].y - apos.y; dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z; dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
r2 = dx * dx + dy * dy + dz * dz; r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_PERIODIC #ifdef USE_CUTOFF
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr) if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF #else
if (r2 < cSim.nonbondedCutoffSqr) if (i < cSim.atoms && y+tj < cSim.atoms)
#endif #endif
{ {
r = sqrt(r2); r = sqrt(r2);
...@@ -243,15 +240,15 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -243,15 +240,15 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
dy = psA[j].y - apos.y; dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z; dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif #endif
r2 = dx * dx + dy * dy + dz * dz; r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_PERIODIC #ifdef USE_CUTOFF
if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr) if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF #else
if (r2 < cSim.nonbondedCutoffSqr) if (i < cSim.atoms && y+j < cSim.atoms)
#endif #endif
{ {
r = sqrt(r2); r = sqrt(r2);
......
...@@ -130,4 +130,6 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu) ...@@ -130,4 +130,6 @@ void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu)
break; break;
} }
LAUNCHERROR("kCalculateObcGbsaForces2"); LAUNCHERROR("kCalculateObcGbsaForces2");
//kPrintObc( gpu, "Post kCalculateObcGbsaForces2", 0, stderr );
} }
...@@ -223,7 +223,7 @@ void kReduceBornSumAndForces_kernel() ...@@ -223,7 +223,7 @@ void kReduceBornSumAndForces_kernel()
void kReduceBornSumAndForces(gpuContext gpu) void kReduceBornSumAndForces(gpuContext gpu)
{ {
//printf("kReduceBornSumAndForces\n"); fprintf( stderr, "kReduceBornSumAndForces\n");
kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceBornSumAndForces"); LAUNCHERROR("kReduceBornSumAndForces");
} }
...@@ -428,8 +428,6 @@ void kReduceGBVIBornForces_kernel() ...@@ -428,8 +428,6 @@ void kReduceGBVIBornForces_kernel()
void kReduceObcGbsaBornForces(gpuContext gpu) void kReduceObcGbsaBornForces(gpuContext gpu)
{ {
//fprintf( stderr, "kReduceObcGbsaBornForces %u %u %u %u %u\n", gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, GF1XX_THREADS_PER_BLOCK,
// GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK ); fflush( stderr );
if( gpu->bIncludeGBSA ){ if( gpu->bIncludeGBSA ){
kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceObcGbsaBornForces"); LAUNCHERROR("kReduceObcGbsaBornForces");
......
...@@ -254,7 +254,7 @@ void CpuGBVI::computeBornRadii( const vector<RealVec>& atomCoordinates, RealOpen ...@@ -254,7 +254,7 @@ void CpuGBVI::computeBornRadii( const vector<RealVec>& atomCoordinates, RealOpen
} }
RealOpenMM atomicRadius3 = POW( radiusI, minusThree ); RealOpenMM atomicRadius3 = POW( radiusI, minusThree );
if( _gbviParameters->getBornRadiusScalingMethod() == GBVIParameters::QuinticSpline ){ if( _gbviParameters->getBornRadiusScalingMethod() != GBVIParameters::QuinticSpline ){
sum = atomicRadius3 - sum; sum = atomicRadius3 - sum;
bornRadii[atomI] = POW( sum, minusOneThird ); bornRadii[atomI] = POW( sum, minusOneThird );
switchDeriviatives[atomI] = one; switchDeriviatives[atomI] = one;
......
...@@ -495,5 +495,69 @@ RealOpenMM CpuObc::computeBornEnergyForces( const vector<RealVec>& atomCoordinat ...@@ -495,5 +495,69 @@ RealOpenMM CpuObc::computeBornEnergyForces( const vector<RealVec>& atomCoordinat
} }
//printObc( atomCoordinates, partialCharges, bornRadii, bornForces, inputForces, "Obc Post loop2", stderr );
return obcEnergy; return obcEnergy;
} }
/**---------------------------------------------------------------------------------------
Print Obc parameters, radii, forces, ...
@param atomCoordinates atomic coordinates
@param partialCharges partial charges
@param bornRadii Born radii (may be empty)
@param bornForces Born forces (may be empty)
@param forces forces (may be empty)
@param idString id string (who is calling)
@param log log file
--------------------------------------------------------------------------------------- */
void CpuObc::printObc( const std::vector<OpenMM::RealVec>& atomCoordinates,
const RealOpenMMVector& partialCharges,
const RealOpenMMVector& bornRadii,
const RealOpenMMVector& bornForces,
const std::vector<OpenMM::RealVec>& forces,
const std::string& idString, FILE* log ){
// ---------------------------------------------------------------------------------------
const ObcParameters* obcParameters = getObcParameters();
const int numberOfAtoms = obcParameters->getNumberOfAtoms();
const RealOpenMMVector& atomicRadii = obcParameters->getAtomicRadii();
const RealOpenMM preFactor = 2.0*obcParameters->getElectricConstant();
const RealOpenMMVector& obcChain = getObcChain();
const RealOpenMMVector& scaledRadiusFactor = obcParameters->getScaledRadiusFactors();
const RealOpenMM alphaObc = obcParameters->getAlphaObc();
const RealOpenMM betaObc = obcParameters->getBetaObc();
const RealOpenMM gammaObc = obcParameters->getGammaObc();
// ---------------------------------------------------------------------------------------
(void) fprintf( log, "Reference Obc %s atoms=%d\n", idString.c_str(), numberOfAtoms );
(void) fprintf( log, " preFactor %15.7e\n", preFactor );
(void) fprintf( log, " alpha %15.7e\n", alphaObc);
(void) fprintf( log, " beta %15.7e\n", betaObc);
(void) fprintf( log, " gamma %15.7e\n", gammaObc );
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
(void) fprintf( log, "%6d r=%15.7e q=%6.3f", atomI,
atomicRadii[atomI], partialCharges[atomI] );
if( obcChain.size() > atomI ){
(void) fprintf( log, " bChn=%15.7e", obcChain[atomI] );
}
if( bornRadii.size() > atomI ){
(void) fprintf( log, " bR=%15.7e", bornRadii[atomI] );
}
if( bornForces.size() > atomI ){
(void) fprintf( log, " bF=%15.7e", bornForces[atomI] );
}
(void) fprintf( log, "\n" );
}
return;
}
...@@ -158,6 +158,27 @@ class CpuObc { ...@@ -158,6 +158,27 @@ class CpuObc {
RealOpenMM computeBornEnergyForces( const std::vector<OpenMM::RealVec>& atomCoordinates, RealOpenMM computeBornEnergyForces( const std::vector<OpenMM::RealVec>& atomCoordinates,
const RealOpenMMVector& partialCharges, std::vector<OpenMM::RealVec>& forces ); const RealOpenMMVector& partialCharges, std::vector<OpenMM::RealVec>& forces );
/**---------------------------------------------------------------------------------------
Print Obc parameters, radii, forces, ...
@param atomCoordinates atomic coordinates
@param partialCharges partial charges
@param bornRadii Born radii (may be empty)
@param bornForces Born forces (may be empty)
@param forces forces (may be empty)
@param idString id string (who is calling)
@param log log file
--------------------------------------------------------------------------------------- */
void printObc( const std::vector<OpenMM::RealVec>& atomCoordinates,
const RealOpenMMVector& partialCharges,
const RealOpenMMVector& bornRadii,
const RealOpenMMVector& bornForces,
const std::vector<OpenMM::RealVec>& forces,
const std::string& idString, FILE* log );
}; };
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment