Commit 408469c3 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME

parent 45b0302d
...@@ -15,9 +15,9 @@ void SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu) ...@@ -15,9 +15,9 @@ void SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
{ {
cudaError_t status; cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed"); RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation)); status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed"); RTERROR(status, "SetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
} }
...@@ -25,15 +25,15 @@ void GetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu) ...@@ -25,15 +25,15 @@ void GetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpuContext amoebaGpu)
{ {
cudaError_t status; cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation)); status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); RTERROR(status, "GetCalculateAmoebaPmeDirectElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
} }
static int const PScaleIndex = 0; static int const PScaleIndex = 0;
static int const DScaleIndex = 1; static int const DScaleIndex = 1;
static int const UScaleIndex = 2; static int const UScaleIndex = 2;
static int const MScaleIndex = 3; static int const MScaleIndex = 3;
static int const LastScalingIndex = 4; static int const LastScalingIndex = 4;
...@@ -105,7 +105,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ, ...@@ -105,7 +105,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
debugArray[index].y = pullBack[pullIndex].y; debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z; debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w; debugArray[index].w = pullBack[pullIndex].w;
} }
} }
*/ */
...@@ -152,43 +152,16 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir ...@@ -152,43 +152,16 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
} }
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ, __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float* scalingFactors, float* outputForce, float outputTorque[2][3], float* energy float* scalingFactors, float* outputForce, float3 outputTorque[3], float* energy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
,float4* debugArray ,float4* debugArray
#endif #endif
){ ){
float e,ei;
float erl,erli;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
float pdi = atomI.damp;
float pti = atomI.thole;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi4 = atomI.labFrameQuadrupole[3];
float qi5 = atomI.labFrameQuadrupole[4];
float qi6 = atomI.labFrameQuadrupole[5];
float qi7 = atomI.labFrameQuadrupole[6];
float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8];
float xr = atomJ.x - atomI.x; float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y; float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z; float zr = atomJ.z - atomI.z;
// periodic box // periodic box
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
...@@ -199,11 +172,33 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -199,11 +172,33 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float r = sqrt(r2); float r = sqrt(r2);
float ck = atomJ.q; float ck = atomJ.q;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
float pdi = atomI.damp;
float pti = atomI.thole;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi4 = atomI.labFrameQuadrupole[3];
float qi5 = atomI.labFrameQuadrupole[4];
float qi6 = atomI.labFrameQuadrupole[5];
float qi7 = atomI.labFrameQuadrupole[6];
float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8];
float dk1 = atomJ.labFrameDipole[0]; float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1]; float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2]; float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0]; float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1]; float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2]; float qk3 = atomJ.labFrameQuadrupole[2];
...@@ -213,7 +208,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -213,7 +208,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qk7 = atomJ.labFrameQuadrupole[6]; float qk7 = atomJ.labFrameQuadrupole[6];
float qk8 = atomJ.labFrameQuadrupole[7]; float qk8 = atomJ.labFrameQuadrupole[7];
float qk9 = atomJ.labFrameQuadrupole[8]; float qk9 = atomJ.labFrameQuadrupole[8];
// calculate the real space error function terms; // calculate the real space error function terms;
float ralpha = cSim.alphaEwald*r; float ralpha = cSim.alphaEwald*r;
...@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -497,18 +492,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// compute the energy contributions for this interaction // compute the energy contributions for this interaction
e = bn0*gl0 + bn1*(gl1+gl6) float e = bn0*gl0 + bn1*(gl1+gl6)
+ bn2*(gl2+gl7+gl8) + bn2*(gl2+gl7+gl8)
+ bn3*(gl3+gl5) + bn4*gl4; + bn3*(gl3+gl5) + bn4*gl4;
ei = 0.5f * (bn1*(gli1+gli6) float ei = 0.5f * (bn1*(gli1+gli6)
+ bn2*(gli2+gli7) + bn3*gli3); + bn2*(gli2+gli7) + bn3*gli3);
// get the real energy without any screening function // get the real energy without any screening function
erl = rr1*gl0 + rr3*(gl1+gl6) float erl = rr1*gl0 + rr3*(gl1+gl6)
+ rr5*(gl2+gl7+gl8) + rr5*(gl2+gl7+gl8)
+ rr7*(gl3+gl5) + rr9*gl4; + rr7*(gl3+gl5) + rr9*gl4;
erli = 0.5f*(rr3*(gli1+gli6)*psc3 float erli = 0.5f*(rr3*(gli1+gli6)*psc3
+ rr5*(gli2+gli7)*psc5 + rr5*(gli2+gli7)*psc5
+ rr7*gli3*psc7); + rr7*gli3*psc7);
e = e - (1.0f-scalingFactors[MScaleIndex])*erl; e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
...@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -552,23 +547,23 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// intermediate variables for induced force terms // intermediate variables for induced force terms
float gfi1 = 0.5f*bn2*(gli1+glip1+gli6+glip6) float gfi1 = 0.5f*(bn2*(gli1+glip1+gli6+glip6)
+ 0.5f*bn2*scip2 + bn2*scip2
+ 0.5f*bn3*(gli2+glip2+gli7+glip7) + bn3*(gli2+glip2+gli7+glip7)
- 0.5f*bn3*(sci3*scip4+scip3*sci4) - bn3*(sci3*scip4+scip3*sci4)
+ 0.5f*bn4*(gli3+glip3); + bn4*(gli3+glip3));
float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3; float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3; float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float gfi4 = 2.0f * bn2; float gfi4 = 2.0f * bn2;
float gfi5 = bn3 * (sci4+scip4); float gfi5 = bn3 * (sci4+scip4);
float gfi6 = -bn3 * (sci3+scip3); float gfi6 = -bn3 * (sci3+scip3);
float gfri1 = 0.5f*rr5*((gli1+gli6)*psc3 float gfri1 = 0.5f*(rr5*((gli1+gli6)*psc3
+ (glip1+glip6)*dsc3 + (glip1+glip6)*dsc3
+ scip2*usc3) + scip2*usc3)
+ 0.5f*rr7*((gli7+gli2)*psc5 + rr7*((gli7+gli2)*psc5
+ (glip7+glip2)*dsc5 + (glip7+glip2)*dsc5
- (sci3*scip4+scip3*sci4)*usc5) - (sci3*scip4+scip3*sci4)*usc5)
+ 0.5f*rr9*(gli3*psc7+glip3*dsc7); + rr9*(gli3*psc7+glip3*dsc7));
float gfri4 = 2.0f * rr5; float gfri4 = 2.0f * rr5;
float gfri5 = rr7 * (sci4*psc7+scip4*dsc7); float gfri5 = rr7 * (sci4*psc7+scip4*dsc7);
float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7); float gfri6 = -rr7 * (sci3*psc7+scip3*dsc7);
...@@ -856,15 +851,15 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -856,15 +851,15 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[0] = conversionFactor*(ftm21 + ftm2i1); outputForce[0] = conversionFactor*(ftm21 + ftm2i1);
outputForce[1] = conversionFactor*(ftm22 + ftm2i2); outputForce[1] = conversionFactor*(ftm22 + ftm2i2);
outputForce[2] = conversionFactor*(ftm23 + ftm2i3); outputForce[2] = conversionFactor*(ftm23 + ftm2i3);
conversionFactor *= -1.0; conversionFactor *= -1.0;
outputTorque[0][0] = conversionFactor*(ttm21 + ttm2i1); outputTorque[0].x = conversionFactor*(ttm21 + ttm2i1);
outputTorque[0][1] = conversionFactor*(ttm22 + ttm2i2); outputTorque[1].x = conversionFactor*(ttm22 + ttm2i2);
outputTorque[0][2] = conversionFactor*(ttm23 + ttm2i3); outputTorque[2].x = conversionFactor*(ttm23 + ttm2i3);
outputTorque[1][0] = conversionFactor*(ttm31 + ttm3i1); outputTorque[1].x = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1][1] = conversionFactor*(ttm32 + ttm3i2); outputTorque[1].y = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1][2] = conversionFactor*(ttm33 + ttm3i3); outputTorque[1].z = conversionFactor*(ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
int debugIndex = 0; int debugIndex = 0;
...@@ -958,14 +953,14 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -958,14 +953,14 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
outputForce[0] = 0.0f; outputForce[0] = 0.0f;
outputForce[1] = 0.0f; outputForce[1] = 0.0f;
outputForce[2] = 0.0f; outputForce[2] = 0.0f;
outputTorque[0][0] = 0.0f; outputTorque[0].x = 0.0f;
outputTorque[0][1] = 0.0f; outputTorque[0].y = 0.0f;
outputTorque[0][2] = 0.0f; outputTorque[0].z = 0.0f;
outputTorque[1][0] = 0.0f; outputTorque[1].x = 0.0f;
outputTorque[1][1] = 0.0f; outputTorque[1].y = 0.0f;
outputTorque[1][2] = 0.0f; outputTorque[1].z = 0.0f;
*energy = 0.0f; *energy = 0.0f;
...@@ -977,7 +972,7 @@ for( int ii = 0; ii < 5; ii++ ){ ...@@ -977,7 +972,7 @@ for( int ii = 0; ii < 5; ii++ ){
debugArray[ii].w = (float) (11*ii); debugArray[ii].w = (float) (11*ii);
} }
#endif #endif
} }
return; return;
...@@ -1070,7 +1065,7 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu ) ...@@ -1070,7 +1065,7 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
{ {
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0; static unsigned int threadsPerBlock = 0;
...@@ -1098,7 +1093,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1098,7 +1093,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
methodName, gpu->natoms, methodName, gpu->natoms,
amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma, amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
amoebaGpu->scalingDistanceCutoff ); amoebaGpu->scalingDistanceCutoff );
} }
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray"); CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms); memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
...@@ -1120,8 +1115,8 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1120,8 +1115,8 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log ); // (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3; offset -= 3;
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
...@@ -1153,7 +1148,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1153,7 +1148,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
(sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) ); (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
...@@ -1200,19 +1195,19 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1200,19 +1195,19 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
amoebaGpu->psForce->Download(); amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download(); amoebaGpu->psTorque->Download();
debugArray->Download(); debugArray->Download();
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 5; int maxPrint = 5;
float conversion = 1.0f/41.84f; float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f}; float forceSum[3] = { 0.0f, 0.0f, 0.0f};
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii); (void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3; int indexOffset = ii*3;
// force // force
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ", (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset], conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset],
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1], conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
...@@ -1223,12 +1218,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1223,12 +1218,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
forceSum[2] += amoebaGpu->psForce->_pSysStream[0][indexOffset+2]; forceSum[2] += amoebaGpu->psForce->_pSysStream[0][indexOffset+2];
// torque // torque
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ", (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset], conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset],
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1], conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] ); conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){ if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint; ii = gpu->natoms - maxPrint;
...@@ -1243,7 +1238,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1243,7 +1238,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else { } else {
energy += (*gpu->psEnergy)[ii]; energy += (*gpu->psEnergy)[ii];
} }
} }
(void) fprintf( amoebaGpu->log,"Force sums: [%16.9e %16.9e %16.9e] Energy=%16.9e\n", forceSum[0], forceSum[1], forceSum[2], energy ); (void) fprintf( amoebaGpu->log,"Force sums: [%16.9e %16.9e %16.9e] Energy=%16.9e\n", forceSum[0], forceSum[1], forceSum[2], energy );
if( 0 ){ if( 0 ){
...@@ -1279,7 +1274,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1279,7 +1274,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
if( 1 ){ if( 1 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
...@@ -1290,7 +1285,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1290,7 +1285,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
} }
} }
delete debugArray; delete debugArray;
#endif #endif
...@@ -1310,5 +1305,5 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1310,5 +1305,5 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{ {
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu ); cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu ); kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
} }
...@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
{ {
float force[3]; float force[3];
float torque[2][3]; float3 torque[2];
unsigned int atomJ = y + j; unsigned int atomJ = y + j;
...@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -151,9 +151,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f; localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f; localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f; localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f; totalEnergy += mask ? 0.5*energy : 0.0f;
...@@ -181,15 +181,15 @@ if( atomI == targetAtom ){ ...@@ -181,15 +181,15 @@ if( atomI == targetAtom ){
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f; debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f; debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f; debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = mask ? energy : 0.0f; debugArray[index].w = mask ? energy : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f; debugArray[index].x = mask ? torque[0].x : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f; debugArray[index].y = mask ? torque[0].y : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f; debugArray[index].z = mask ? torque[0].z : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x); debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){ for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
...@@ -304,7 +304,7 @@ if( atomI == targetAtom ){ ...@@ -304,7 +304,7 @@ if( atomI == targetAtom ){
unsigned int atomJ = y + jIdx; unsigned int atomJ = y + jIdx;
float force[3]; float force[3];
float torque[2][3]; float3 torque[2];
// set scale factors // set scale factors
...@@ -335,9 +335,9 @@ if( atomI == targetAtom ){ ...@@ -335,9 +335,9 @@ if( atomI == targetAtom ){
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f; localParticle.torque[0] += mask ? torque[0].x : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f; localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f; localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? energy : 0.0f; totalEnergy += mask ? energy : 0.0f;
...@@ -349,9 +349,9 @@ if( atomI == targetAtom ){ ...@@ -349,9 +349,9 @@ if( atomI == targetAtom ){
psA[jIdx].force[1] -= mask ? force[1] : 0.0f; psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f; psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f; psA[jIdx].torque[0] += mask ? torque[1].x : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f; psA[jIdx].torque[1] += mask ? torque[1].y : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f; psA[jIdx].torque[2] += mask ? torque[1].z : 0.0f;
} else { } else {
...@@ -359,9 +359,9 @@ if( atomI == targetAtom ){ ...@@ -359,9 +359,9 @@ if( atomI == targetAtom ){
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1]; sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2]; sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0]; sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1].x;
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1]; sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1].y;
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2]; sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1].z;
if( tgx % 2 == 0 ){ if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] ); sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
...@@ -187,137 +187,136 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -187,137 +187,136 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr; float r2 = xr*xr + yr*yr + zr*zr;
float r = sqrtf(r2); if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms // calculate the error function damping terms
float ralpha = cSim.alphaEwald*r; float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r; float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald; float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald); float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha)); float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2; alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2; float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2; alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2; float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
alsq2n *= alsq2; alsq2n *= alsq2;
float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2; float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms // compute the error function scaled and unscaled terms
float scale3 = 1.0f; float scale3 = 1.0f;
float scale5 = 1.0f; float scale5 = 1.0f;
float scale7 = 1.0f; float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp; float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){ if( damp != 0.0f ){
float ratio = (r/damp); float ratio = (r/damp);
ratio = ratio*ratio*ratio; ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole; float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio; damp = -pgamma*ratio;
if( damp > -50.0f) { if( damp > -50.0f) {
float expdamp = exp(damp); float expdamp = exp(damp);
scale3 = 1.0f - expdamp; scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp); scale5 = 1.0f - expdamp*(1.0f-damp);
scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp)); scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
}
} }
} float dsc3 = dscale*scale3;
float dsc3 = dscale*scale3; float dsc5 = dscale*scale5;
float dsc5 = dscale*scale5; float dsc7 = dscale*scale7;
float dsc7 = dscale*scale7;
float psc3 = pscale*scale3; float psc3 = pscale*scale3;
float psc5 = pscale*scale5; float psc5 = pscale*scale5;
float psc7 = pscale*scale7; float psc7 = pscale*scale7;
float r3 = (r*r2); float r3 = (r*r2);
float r5 = (r3*r2); float r5 = (r3*r2);
float r7 = (r5*r2); float r7 = (r5*r2);
float drr3 = (1.0f-dsc3)/r3; float drr3 = (1.0f-dsc3)/r3;
float drr5 = 3.0f * (1.0f-dsc5)/r5; float drr5 = 3.0f * (1.0f-dsc5)/r5;
float drr7 = 15.0f * (1.0f-dsc7)/r7; float drr7 = 15.0f * (1.0f-dsc7)/r7;
float prr3 = (1.0f-psc3) / r3; float prr3 = (1.0f-psc3) / r3;
float prr5 = 3.0f *(1.0f-psc5)/r5; float prr5 = 3.0f *(1.0f-psc5)/r5;
float prr7 = 15.0f*(1.0f-psc7)/r7; float prr7 = 15.0f*(1.0f-psc7)/r7;
float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr; float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr;
float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr; float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr; float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr; float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
float qir = qix*xr + qiy*yr + qiz*zr; float qir = qix*xr + qiy*yr + qiz*zr;
float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr; float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr;
float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr; float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr; float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr; float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
float qkr = qkx*xr + qky*yr + qkz*zr; float qkr = qkx*xr + qky*yr + qkz*zr;
float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx; - bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx;
float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky; - bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky;
float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz; - bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz;
float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir) float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix; - bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix;
float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir) float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy; - bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy;
float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir) float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz; - bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz;
float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx; - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky; - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz; - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir) float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix; - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir) float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy; - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir) float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz; - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx; - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky; - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz; - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir) float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix; - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir) float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy; - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir) float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz; - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
// increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){ // increment the field at each site due to this interaction
fields[0].x = fim0 - fid0; fields[0].x = fim0 - fid0;
fields[1].x = fim1 - fid1; fields[1].x = fim1 - fid1;
......
...@@ -75,92 +75,91 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -75,92 +75,91 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr; float r2 = xr*xr + yr* yr + zr*zr;
float r = sqrtf(r2); if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms // calculate the error function damping terms
float ralpha = cSim.alphaEwald*r; float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r; float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald; float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald); float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha)); float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2; alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2; float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2; alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2; float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms // compute the error function scaled and unscaled terms
float scale3 = 1.0f; float scale3 = 1.0f;
float scale5 = 1.0f; float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp; float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){ if( damp != 0.0f ){
float ratio = (r/damp); float ratio = (r/damp);
ratio = ratio*ratio*ratio; ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole; float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio; damp = -pgamma*ratio;
if( damp > -50.0f) { if( damp > -50.0f) {
float expdamp = exp(damp); float expdamp = exp(damp);
scale3 = 1.0f - expdamp; scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp); scale5 = 1.0f - expdamp*(1.0f-damp);
}
} }
} float dsc3 = uscale*scale3;
float dsc3 = uscale*scale3; float dsc5 = uscale*scale5;
float dsc5 = uscale*scale5;
float r3 = (r*r2); float r3 = (r*r2);
float r5 = (r3*r2); float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3; float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f * (1.0f-dsc5)/r5; float rr5 = 3.0f * (1.0f-dsc5)/r5;
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr; float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr; float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr; float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr; float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
bn1 *= -1.0f; bn1 *= -1.0f;
float fimd0 = bn1*atomJ.inducedDipole[0] + bn2*dukr*xr; float fimd0 = bn1*atomJ.inducedDipole[0] + bn2*dukr*xr;
float fimd1 = bn1*atomJ.inducedDipole[1] + bn2*dukr*yr; float fimd1 = bn1*atomJ.inducedDipole[1] + bn2*dukr*yr;
float fimd2 = bn1*atomJ.inducedDipole[2] + bn2*dukr*zr; float fimd2 = bn1*atomJ.inducedDipole[2] + bn2*dukr*zr;
float fkmd0 = bn1*atomI.inducedDipole[0] + bn2*duir*xr; float fkmd0 = bn1*atomI.inducedDipole[0] + bn2*duir*xr;
float fkmd1 = bn1*atomI.inducedDipole[1] + bn2*duir*yr; float fkmd1 = bn1*atomI.inducedDipole[1] + bn2*duir*yr;
float fkmd2 = bn1*atomI.inducedDipole[2] + bn2*duir*zr; float fkmd2 = bn1*atomI.inducedDipole[2] + bn2*duir*zr;
float fimp0 = bn1*atomJ.inducedDipolePolar[0] + bn2*pukr*xr; float fimp0 = bn1*atomJ.inducedDipolePolar[0] + bn2*pukr*xr;
float fimp1 = bn1*atomJ.inducedDipolePolar[1] + bn2*pukr*yr; float fimp1 = bn1*atomJ.inducedDipolePolar[1] + bn2*pukr*yr;
float fimp2 = bn1*atomJ.inducedDipolePolar[2] + bn2*pukr*zr; float fimp2 = bn1*atomJ.inducedDipolePolar[2] + bn2*pukr*zr;
float fkmp0 = bn1*atomI.inducedDipolePolar[0] + bn2*puir*xr; float fkmp0 = bn1*atomI.inducedDipolePolar[0] + bn2*puir*xr;
float fkmp1 = bn1*atomI.inducedDipolePolar[1] + bn2*puir*yr; float fkmp1 = bn1*atomI.inducedDipolePolar[1] + bn2*puir*yr;
float fkmp2 = bn1*atomI.inducedDipolePolar[2] + bn2*puir*zr; float fkmp2 = bn1*atomI.inducedDipolePolar[2] + bn2*puir*zr;
rr3 *= -1.0f;; rr3 *= -1.0f;;
float fid0 = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr; float fid0 = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
float fid1 = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr; float fid1 = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
float fid2 = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr; float fid2 = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
float fkd0 = rr3*atomI.inducedDipole[0] + rr5*duir*xr; float fkd0 = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
float fkd1 = rr3*atomI.inducedDipole[1] + rr5*duir*yr; float fkd1 = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
float fkd2 = rr3*atomI.inducedDipole[2] + rr5*duir*zr; float fkd2 = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
float fip0 = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr; float fip0 = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
float fip1 = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr; float fip1 = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
float fip2 = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr; float fip2 = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
float fkp0 = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr; float fkp0 = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
float fkp1 = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr; float fkp1 = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
float fkp2 = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr; float fkp2 = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0].x = fimd0 - fid0; fields[0].x = fimd0 - fid0;
fields[0].y = fkmd0 - fkd0; fields[0].y = fkmd0 - fkd0;
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130) #elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment