Commit 408469c3 authored by Peter Eastman
Browse files

Optimizations to PME

parent 45b0302d
......@@ -152,15 +152,26 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
}
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float* scalingFactors, float* outputForce, float outputTorque[2][3], float* energy
float* scalingFactors, float* outputForce, float3 outputTorque[3], float* energy
#ifdef AMOEBA_DEBUG
,float4* debugArray
#endif
){
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic box
float e,ei;
float erl,erli;
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2);
float ck = atomJ.q;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
......@@ -184,22 +195,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8];
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic box
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2);
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
......@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// compute the energy contributions for this interaction
e = bn0*gl0 + bn1*(gl1+gl6)
float e = bn0*gl0 + bn1*(gl1+gl6)
+ bn2*(gl2+gl7+gl8)
+ bn3*(gl3+gl5) + bn4*gl4;
ei = 0.5f * (bn1*(gli1+gli6)
float ei = 0.5f * (bn1*(gli1+gli6)
+ bn2*(gli2+gli7) + bn3*gli3);
// get the real energy without any screening function
erl = rr1*gl0 + rr3*(gl1+gl6)
float erl = rr1*gl0 + rr3*(gl1+gl6)
+ rr5*(gl2+gl7+gl8)
+ rr7*(gl3+gl5) + rr9*gl4;
erli = 0.5f*(rr3*(gli1+gli6)*psc3
float erli = 0.5f*(rr3*(gli1+gli6)*psc3
+ rr5*(gli2+gli7)*psc5
+ rr7*gli3*psc7);
e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
......@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// intermediate variables for induced force terms
float gfi1 = 0.5f*bn2*(gli1+glip1+gli6+glip6)
+ 0.5f*bn2*scip2
+ 0.5f*bn3*(gli2+glip2+gli7+glip7)
- 0.5f*bn3*(sci3*scip4+scip3*sci4)
+ 0.5f*bn4*(gli3+glip3);
float gfi1 = 0.5f*(bn2*(gli1+glip1+gli6+glip6)
+ bn2*scip2
+ bn3*(gli2+glip2+gli7+glip7)
- bn3*(sci3*scip4+scip3*sci4)
+ bn4*(gli3+glip3));
float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float gfi4 = 2.0f * bn2;
float gfi5 = bn3 * (sci4+scip4);
float gfi6 = -bn3 * (sci3+scip3);
float gfri1 = 0.5f*rr5*((gli1+gli6)*psc3
float gfri1 = 0.5f*(rr5*((gli1+gli6)*psc3
+ (glip1+glip6)*dsc3
+ scip2*usc3)
+ 0.5f*rr7*((gli7+gli2)*psc5
+ rr7*((gli7+gli2)*psc5
+ (glip7+glip2)*dsc5
- (sci3*scip4+scip3*sci4)*usc5)
+ 0.5f*rr9*(gli3*psc7+glip3*dsc7);
+ rr9*(gli3*psc7+glip3*dsc7));
float gfri4 = 2.0f * rr5;
float gfri5 = rr7 * (sci4*psc7+scip4*dsc7);
float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7);
......@@ -858,13 +853,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[2] = conversionFactor*(ftm23 + ftm2i3);
conversionFactor *= -1.0;
outputTorque[0][0] = conversionFactor*(ttm21 + ttm2i1);
outputTorque[0][1] = conversionFactor*(ttm22 + ttm2i2);
outputTorque[0][2] = conversionFactor*(ttm23 + ttm2i3);
outputTorque[0].x = conversionFactor*(ttm21 + ttm2i1);
outputTorque[1].x = conversionFactor*(ttm22 + ttm2i2);
outputTorque[2].x = conversionFactor*(ttm23 + ttm2i3);
outputTorque[1][0] = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1][1] = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1][2] = conversionFactor*(ttm33 + ttm3i3);
outputTorque[1].x = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1].y = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1].z = conversionFactor*(ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG
int debugIndex = 0;
......@@ -959,13 +954,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[1] = 0.0f;
outputForce[2] = 0.0f;
outputTorque[0][0] = 0.0f;
outputTorque[0][1] = 0.0f;
outputTorque[0][2] = 0.0f;
outputTorque[0].x = 0.0f;
outputTorque[0].y = 0.0f;
outputTorque[0].z = 0.0f;
outputTorque[1][0] = 0.0f;
outputTorque[1][1] = 0.0f;
outputTorque[1][2] = 0.0f;
outputTorque[1].x = 0.0f;
outputTorque[1].y = 0.0f;
outputTorque[1].z = 0.0f;
*energy = 0.0f;
......
......@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
{
float force[3];
float torque[2][3];
float3 torque[2];
unsigned int atomJ = y + j;
......@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f;
......@@ -181,15 +181,15 @@ if( atomI == targetAtom ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f;
debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = mask ? energy : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f;
debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
......@@ -304,7 +304,7 @@ if( atomI == targetAtom ){
unsigned int atomJ = y + jIdx;
float force[3];
float torque[2][3];
float3 torque[2];
// set scale factors
......@@ -335,9 +335,9 @@ if( atomI == targetAtom ){
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? energy : 0.0f;
......@@ -349,9 +349,9 @@ if( atomI == targetAtom ){
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1].x : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1].y : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1].z : 0.0f;
} else {
......@@ -359,9 +359,9 @@ if( atomI == targetAtom ){
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1].x;
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1].y;
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1].z;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
......@@ -187,6 +187,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
......@@ -317,8 +318,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0].x = fim0 - fid0;
fields[1].x = fim1 - fid1;
fields[2].x = fim2 - fid2;
......
......@@ -75,6 +75,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
......@@ -160,8 +161,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0].x = fimd0 - fid0;
fields[0].y = fkmd0 - fkd0;
fields[0].z = fimp0 - fip0;
......
......@@ -29,7 +29,7 @@
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment