Commit 408469c3 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME

parent 45b0302d
......@@ -15,9 +15,9 @@ void SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
......@@ -25,15 +25,15 @@ void GetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
static int const PScaleIndex = 0;
static int const DScaleIndex = 1;
static int const UScaleIndex = 2;
static int const PScaleIndex = 0;
static int const DScaleIndex = 1;
static int const UScaleIndex = 2;
static int const MScaleIndex = 3;
static int const LastScalingIndex = 4;
......@@ -105,7 +105,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
}
*/
......@@ -152,43 +152,16 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
}
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float* scalingFactors, float* outputForce, float outputTorque[2][3], float* energy
float* scalingFactors, float* outputForce, float3 outputTorque[3], float* energy
#ifdef AMOEBA_DEBUG
,float4* debugArray
,float4* debugArray
#endif
){
float e,ei;
float erl,erli;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
float pdi = atomI.damp;
float pti = atomI.thole;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi4 = atomI.labFrameQuadrupole[3];
float qi5 = atomI.labFrameQuadrupole[4];
float qi6 = atomI.labFrameQuadrupole[5];
float qi7 = atomI.labFrameQuadrupole[6];
float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8];
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic box
// periodic box
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
......@@ -199,11 +172,33 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float r = sqrt(r2);
float ck = atomJ.q;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
float pdi = atomI.damp;
float pti = atomI.thole;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi4 = atomI.labFrameQuadrupole[3];
float qi5 = atomI.labFrameQuadrupole[4];
float qi6 = atomI.labFrameQuadrupole[5];
float qi7 = atomI.labFrameQuadrupole[6];
float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8];
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
......@@ -213,7 +208,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qk7 = atomJ.labFrameQuadrupole[6];
float qk8 = atomJ.labFrameQuadrupole[7];
float qk9 = atomJ.labFrameQuadrupole[8];
// calculate the real space error function terms;
float ralpha = cSim.alphaEwald*r;
......@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// compute the energy contributions for this interaction
e = bn0*gl0 + bn1*(gl1+gl6)
float e = bn0*gl0 + bn1*(gl1+gl6)
+ bn2*(gl2+gl7+gl8)
+ bn3*(gl3+gl5) + bn4*gl4;
ei = 0.5f * (bn1*(gli1+gli6)
float ei = 0.5f * (bn1*(gli1+gli6)
+ bn2*(gli2+gli7) + bn3*gli3);
// get the real energy without any screening function
erl = rr1*gl0 + rr3*(gl1+gl6)
float erl = rr1*gl0 + rr3*(gl1+gl6)
+ rr5*(gl2+gl7+gl8)
+ rr7*(gl3+gl5) + rr9*gl4;
erli = 0.5f*(rr3*(gli1+gli6)*psc3
float erli = 0.5f*(rr3*(gli1+gli6)*psc3
+ rr5*(gli2+gli7)*psc5
+ rr7*gli3*psc7);
e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
......@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// intermediate variables for induced force terms
float gfi1 = 0.5f*bn2*(gli1+glip1+gli6+glip6)
+ 0.5f*bn2*scip2
+ 0.5f*bn3*(gli2+glip2+gli7+glip7)
- 0.5f*bn3*(sci3*scip4+scip3*sci4)
+ 0.5f*bn4*(gli3+glip3);
float gfi1 = 0.5f*(bn2*(gli1+glip1+gli6+glip6)
+ bn2*scip2
+ bn3*(gli2+glip2+gli7+glip7)
- bn3*(sci3*scip4+scip3*sci4)
+ bn4*(gli3+glip3));
float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float gfi4 = 2.0f * bn2;
float gfi5 = bn3 * (sci4+scip4);
float gfi6 = -bn3 * (sci3+scip3);
float gfri1 = 0.5f*rr5*((gli1+gli6)*psc3
float gfri1 = 0.5f*(rr5*((gli1+gli6)*psc3
+ (glip1+glip6)*dsc3
+ scip2*usc3)
+ 0.5f*rr7*((gli7+gli2)*psc5
+ rr7*((gli7+gli2)*psc5
+ (glip7+glip2)*dsc5
- (sci3*scip4+scip3*sci4)*usc5)
+ 0.5f*rr9*(gli3*psc7+glip3*dsc7);
+ rr9*(gli3*psc7+glip3*dsc7));
float gfri4 = 2.0f * rr5;
float gfri5 = rr7 * (sci4*psc7+scip4*dsc7);
float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7);
......@@ -856,15 +851,15 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[0] = conversionFactor*(ftm21 + ftm2i1);
outputForce[1] = conversionFactor*(ftm22 + ftm2i2);
outputForce[2] = conversionFactor*(ftm23 + ftm2i3);
conversionFactor *= -1.0;
outputTorque[0][0] = conversionFactor*(ttm21 + ttm2i1);
outputTorque[0][1] = conversionFactor*(ttm22 + ttm2i2);
outputTorque[0][2] = conversionFactor*(ttm23 + ttm2i3);
outputTorque[1][0] = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1][1] = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1][2] = conversionFactor*(ttm33 + ttm3i3);
outputTorque[0].x = conversionFactor*(ttm21 + ttm2i1);
outputTorque[1].x = conversionFactor*(ttm22 + ttm2i2);
outputTorque[2].x = conversionFactor*(ttm23 + ttm2i3);
outputTorque[1].x = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1].y = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1].z = conversionFactor*(ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG
int debugIndex = 0;
......@@ -958,14 +953,14 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[0] = 0.0f;
outputForce[1] = 0.0f;
outputForce[2] = 0.0f;
outputTorque[0][0] = 0.0f;
outputTorque[0][1] = 0.0f;
outputTorque[0][2] = 0.0f;
outputTorque[1][0] = 0.0f;
outputTorque[1][1] = 0.0f;
outputTorque[1][2] = 0.0f;
outputTorque[0].x = 0.0f;
outputTorque[0].y = 0.0f;
outputTorque[0].z = 0.0f;
outputTorque[1].x = 0.0f;
outputTorque[1].y = 0.0f;
outputTorque[1].z = 0.0f;
*energy = 0.0f;
......@@ -977,7 +972,7 @@ for( int ii = 0; ii < 5; ii++ ){
debugArray[ii].w = (float) (11*ii);
}
#endif
}
return;
......@@ -1070,7 +1065,7 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
......@@ -1098,7 +1093,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
methodName, gpu->natoms,
amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
amoebaGpu->scalingDistanceCutoff );
}
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
......@@ -1120,8 +1115,8 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
......@@ -1153,7 +1148,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 2 );
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
(sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
......@@ -1200,19 +1195,19 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download();
debugArray->Download();
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 5;
float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f};
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset],
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
......@@ -1223,12 +1218,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
forceSum[2] += amoebaGpu->psForce->_pSysStream[0][indexOffset+2];
// torque
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset],
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
......@@ -1243,7 +1238,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else {
energy += (*gpu->psEnergy)[ii];
}
}
}
(void) fprintf( amoebaGpu->log,"Force sums: [%16.9e %16.9e %16.9e] Energy=%16.9e\n", forceSum[0], forceSum[1], forceSum[2], energy );
if( 0 ){
......@@ -1279,7 +1274,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
}
(void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
......@@ -1290,7 +1285,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
}
}
}
delete debugArray;
#endif
......@@ -1310,5 +1305,5 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
}
}
......@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
{
float force[3];
float torque[2][3];
float3 torque[2];
unsigned int atomJ = y + j;
......@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f;
......@@ -181,15 +181,15 @@ if( atomI == targetAtom ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f;
debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = mask ? energy : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f;
debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
......@@ -304,7 +304,7 @@ if( atomI == targetAtom ){
unsigned int atomJ = y + jIdx;
float force[3];
float torque[2][3];
float3 torque[2];
// set scale factors
......@@ -335,9 +335,9 @@ if( atomI == targetAtom ){
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? energy : 0.0f;
......@@ -349,9 +349,9 @@ if( atomI == targetAtom ){
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1].x : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1].y : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1].z : 0.0f;
} else {
......@@ -359,9 +359,9 @@ if( atomI == targetAtom ){
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1].x;
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1].y;
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1].z;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
......@@ -187,137 +187,136 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
float r = sqrtf(r2);
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
// compute the error function scaled and unscaled terms
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
damp = -pgamma*ratio;
if( damp > -50.0f) {
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
if( damp > -50.0f) {
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
}
}
}
float dsc3 = dscale*scale3;
float dsc5 = dscale*scale5;
float dsc7 = dscale*scale7;
float dsc3 = dscale*scale3;
float dsc5 = dscale*scale5;
float dsc7 = dscale*scale7;
float psc3 = pscale*scale3;
float psc5 = pscale*scale5;
float psc7 = pscale*scale7;
float psc3 = pscale*scale3;
float psc5 = pscale*scale5;
float psc7 = pscale*scale7;
float r3 = (r*r2);
float r5 = (r3*r2);
float r7 = (r5*r2);
float drr3 = (1.0f-dsc3)/r3;
float drr5 = 3.0f * (1.0f-dsc5)/r5;
float drr7 = 15.0f * (1.0f-dsc7)/r7;
float r3 = (r*r2);
float r5 = (r3*r2);
float r7 = (r5*r2);
float drr3 = (1.0f-dsc3)/r3;
float drr5 = 3.0f * (1.0f-dsc5)/r5;
float drr7 = 15.0f * (1.0f-dsc7)/r7;
float prr3 = (1.0f-psc3) / r3;
float prr5 = 3.0f *(1.0f-psc5)/r5;
float prr7 = 15.0f*(1.0f-psc7)/r7;
float prr3 = (1.0f-psc3) / r3;
float prr5 = 3.0f *(1.0f-psc5)/r5;
float prr7 = 15.0f*(1.0f-psc7)/r7;
float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr;
float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr;
float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
float qir = qix*xr + qiy*yr + qiz*zr;
float qir = qix*xr + qiy*yr + qiz*zr;
float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr;
float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
float qkr = qkx*xr + qky*yr + qkz*zr;
float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr;
float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
float qkr = qkx*xr + qky*yr + qkz*zr;
float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx;
float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx;
float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky;
float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky;
float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz;
float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz;
float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix;
float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix;
float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy;
float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy;
float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz;
float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz;
float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
// increment the field at each site due to this interaction
float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
if( r2 <= cSim.nonbondedCutoffSqr ){
// increment the field at each site due to this interaction
fields[0].x = fim0 - fid0;
fields[1].x = fim1 - fid1;
......
......@@ -75,92 +75,91 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr;
float r = sqrtf(r2);
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
// compute the error function scaled and unscaled terms
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
if( damp > -50.0f) {
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
if( damp > -50.0f) {
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f * (1.0f-dsc5)/r5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f * (1.0f-dsc5)/r5;
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
bn1 *= -1.0f;
bn1 *= -1.0f;
float fimd0 = bn1*atomJ.inducedDipole[0] + bn2*dukr*xr;
float fimd1 = bn1*atomJ.inducedDipole[1] + bn2*dukr*yr;
float fimd2 = bn1*atomJ.inducedDipole[2] + bn2*dukr*zr;
float fimd0 = bn1*atomJ.inducedDipole[0] + bn2*dukr*xr;
float fimd1 = bn1*atomJ.inducedDipole[1] + bn2*dukr*yr;
float fimd2 = bn1*atomJ.inducedDipole[2] + bn2*dukr*zr;
float fkmd0 = bn1*atomI.inducedDipole[0] + bn2*duir*xr;
float fkmd1 = bn1*atomI.inducedDipole[1] + bn2*duir*yr;
float fkmd2 = bn1*atomI.inducedDipole[2] + bn2*duir*zr;
float fkmd0 = bn1*atomI.inducedDipole[0] + bn2*duir*xr;
float fkmd1 = bn1*atomI.inducedDipole[1] + bn2*duir*yr;
float fkmd2 = bn1*atomI.inducedDipole[2] + bn2*duir*zr;
float fimp0 = bn1*atomJ.inducedDipolePolar[0] + bn2*pukr*xr;
float fimp1 = bn1*atomJ.inducedDipolePolar[1] + bn2*pukr*yr;
float fimp2 = bn1*atomJ.inducedDipolePolar[2] + bn2*pukr*zr;
float fimp0 = bn1*atomJ.inducedDipolePolar[0] + bn2*pukr*xr;
float fimp1 = bn1*atomJ.inducedDipolePolar[1] + bn2*pukr*yr;
float fimp2 = bn1*atomJ.inducedDipolePolar[2] + bn2*pukr*zr;
float fkmp0 = bn1*atomI.inducedDipolePolar[0] + bn2*puir*xr;
float fkmp1 = bn1*atomI.inducedDipolePolar[1] + bn2*puir*yr;
float fkmp2 = bn1*atomI.inducedDipolePolar[2] + bn2*puir*zr;
float fkmp0 = bn1*atomI.inducedDipolePolar[0] + bn2*puir*xr;
float fkmp1 = bn1*atomI.inducedDipolePolar[1] + bn2*puir*yr;
float fkmp2 = bn1*atomI.inducedDipolePolar[2] + bn2*puir*zr;
rr3 *= -1.0f;;
float fid0 = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
float fid1 = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
float fid2 = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
rr3 *= -1.0f;;
float fid0 = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
float fid1 = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
float fid2 = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
float fkd0 = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
float fkd1 = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
float fkd2 = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
float fkd0 = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
float fkd1 = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
float fkd2 = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
float fip0 = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
float fip1 = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
float fip2 = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
float fip0 = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
float fip1 = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
float fip2 = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
float fkp0 = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
float fkp1 = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
float fkp2 = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
float fkp0 = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
float fkp1 = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
float fkp2 = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
// increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
// increment the field at each site due to this interaction
fields[0].x = fimd0 - fid0;
fields[0].y = fkmd0 - fkd0;
......
......@@ -29,7 +29,7 @@
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment