Commit 4a1e9683 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Optimzations and bug fix for mapping of torque to force for small molecules

parent 44bfd168
...@@ -688,8 +688,9 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -688,8 +688,9 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName ); fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR Initial setup for matrix multiply\n" ); fflush( amoebaGpu->log );
gpu->psPosq4->Download();
amoebaGpu->psE_Field->Download(); amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download(); amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psInducedDipole->Download(), amoebaGpu->psInducedDipole->Download(),
...@@ -708,9 +709,10 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -708,9 +709,10 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
(void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] );
} }
(void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e]\n", (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] Pos[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_Field->_pSysData[offset], amoebaGpu->psE_Field->_pSysData[offset+1], amoebaGpu->psE_Field->_pSysData[offset+2], amoebaGpu->psE_Field->_pSysData[offset], amoebaGpu->psE_Field->_pSysData[offset+1], amoebaGpu->psE_Field->_pSysData[offset+2],
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] ); amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2],
gpu->psPosq4->_pSysData[ii].x, gpu->psPosq4->_pSysData[ii].y, gpu->psPosq4->_pSysData[ii].z );
(void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n", (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_FieldPolar->_pSysData[offset], amoebaGpu->psE_FieldPolar->_pSysData[offset+1], amoebaGpu->psE_FieldPolar->_pSysData[offset+2], amoebaGpu->psE_FieldPolar->_pSysData[offset], amoebaGpu->psE_FieldPolar->_pSysData[offset+1], amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] ); amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
...@@ -777,38 +779,33 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -777,38 +779,33 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
amoebaGpu->psCurrentEpsilon->_pDevData ); amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFieldDelta_kernel"); LAUNCHERROR("kReduceMutualInducedAndGkFieldDelta_kernel");
#if 0
// get total epsilon -- performing sums on cpu
{
float sum[4];
float currentEpsilon = -1.0e30;
for( int ii = 0; ii < 4; ii++ ){
sum[ii] = cudaGetSum( 3*gpu->natoms, amoebaGpu->psWorkVector[ii]);
sum[ii] = 48.033324f*sqrtf( sum[ii]/( (float) gpu->natoms) );
if( sum[ii] > currentEpsilon ){
currentEpsilon = sum[ii];
}
}
amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d sums=%14.6e %14.6e %14.6e %14.6e\n",
methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done, sum[0], sum[1], sum[2], sum[3] );
}
#endif
// Debye=48.033324f // Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download(); amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0]; float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon; amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;
// check for nans
if( currentEpsilon != currentEpsilon ){
(void) fprintf( stderr, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR at timestep=%d iteration=%3d eps is nan -- exiting.\n",
timestep, iteration );
(void) fflush( NULL );
exit(-1);
}
// converged?
if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){
done = 1; done = 1;
} }
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR step=%d iteration=%3d eps %14.6e done=%d\n",
timestep, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
}
if( amoebaGpu->log ){ if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download(); amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download(); amoebaGpu->psInducedDipolePolar->Download();
...@@ -869,11 +866,11 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -869,11 +866,11 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
amoebaGpu->mutualInducedDone = done; amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1; amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){ if( 0 && amoebaGpu->log ){
trackMutualInducedIterations( amoebaGpu, iteration ); trackMutualInducedIterations( amoebaGpu, iteration );
} }
#ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
......
...@@ -558,9 +558,11 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -558,9 +558,11 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
amoebaGpu->psCurrentEpsilon->_pDevData ); amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReduceMutualInducedFieldDelta"); LAUNCHERROR("kReduceMutualInducedFieldDelta");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
trackMutualInducedIterations( amoebaGpu, iteration); trackMutualInducedIterations( amoebaGpu, iteration);
} }
#endif
// Debye=48.033324f // Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download(); amoebaGpu->psCurrentEpsilon->Download();
......
...@@ -54,15 +54,11 @@ struct PmeDirectElectrostaticParticle { ...@@ -54,15 +54,11 @@ struct PmeDirectElectrostaticParticle {
float labFrameDipole[3]; float labFrameDipole[3];
// lab frame quadrupole // lab frame quadrupole
float labFrameQuadrupole[5];
float labFrameQuadrupole[6];
// induced dipole // induced dipole
float inducedDipole[3]; float inducedDipole[3];
// polar induced dipole
float inducedDipoleP[3]; float inducedDipoleP[3];
// scaling factors // scaling factors
...@@ -72,7 +68,7 @@ struct PmeDirectElectrostaticParticle { ...@@ -72,7 +68,7 @@ struct PmeDirectElectrostaticParticle {
float force[3]; float force[3];
float torque[3]; float torque[3];
//float padding; float padding;
#ifndef CALCULATE_FULL_TILE #ifndef CALCULATE_FULL_TILE
float tempForce[3]; float tempForce[3];
...@@ -117,816 +113,60 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ, ...@@ -117,816 +113,60 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
// self-energy for PME // self-energy for PME
__device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, float* energy) __device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, float* totalEnergy)
{
float term = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float fterm = -cSim.alphaEwald/cAmoebaSim.sqrtPi;
float cii = atomI.q*atomI.q;
float dii = atomI.labFrameDipole[0]*atomI.labFrameDipole[0] +
atomI.labFrameDipole[1]*atomI.labFrameDipole[1] +
atomI.labFrameDipole[2]*atomI.labFrameDipole[2];
float qii = atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[0] +
atomI.labFrameQuadrupole[3]*atomI.labFrameQuadrupole[3] +
atomI.labFrameQuadrupole[5]*atomI.labFrameQuadrupole[5] + 2.0f*(
atomI.labFrameQuadrupole[1]*atomI.labFrameQuadrupole[1] +
atomI.labFrameQuadrupole[2]*atomI.labFrameQuadrupole[2] +
atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4]);
float uii = atomI.labFrameDipole[0]*atomI.inducedDipole[0] + atomI.labFrameDipole[1]*atomI.inducedDipole[1] + atomI.labFrameDipole[2]*atomI.inducedDipole[2];
*energy = (cii + term*(dii/3.0f + 2.0f*term*qii/5.0f));
*energy += term*uii/3.0f;
*energy *= fterm;
}
// self-torque for PME
__device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI)
{ {
float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; float term = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float fterm = -cSim.alphaEwald/cAmoebaSim.sqrtPi;
float uix = 0.5f*(atomI.inducedDipole[0] + atomI.inducedDipoleP[0]);
float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
float uiz = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy); float cii = atomI.q*atomI.q;
atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
}
float dii = atomI.labFrameDipole[0]*atomI.labFrameDipole[0] +
atomI.labFrameDipole[1]*atomI.labFrameDipole[1] +
atomI.labFrameDipole[2]*atomI.labFrameDipole[2];
/* /*
#undef SUB_METHOD_NAME float qii = atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[0] +
#undef F1 atomI.labFrameQuadrupole[3]*atomI.labFrameQuadrupole[3] +
#define SUB_METHOD_NAME(a, b) a##F1##b atomI.labFrameQuadrupole[5]*atomI.labFrameQuadrupole[5] + 2.0f*(
#define F1 atomI.labFrameQuadrupole[1]*atomI.labFrameQuadrupole[1] +
//#define T1 atomI.labFrameQuadrupole[2]*atomI.labFrameQuadrupole[2] +
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h" atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4]);
#undef F1
//#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##F2##b
#define F2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T1
#define SUB_METHOD_NAME(a, b) a##T1##b
#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T2##b
#define T2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T3
#define SUB_METHOD_NAME(a, b) a##T3##b
#define T3
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T3
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T4##b
#define T4
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T4
#undef SUB_METHOD_NAME
*/ */
static __device__ void calculatePmeDirectElectrostaticPairIxnF1_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ, float qii = atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[0] +
const float4 delta, const float4 bn, const float bn5, const float* scalingFactors, float4* forceTorqueEnergy ){ atomI.labFrameQuadrupole[3]*atomI.labFrameQuadrupole[3] +
atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[3] +
float xr = delta.x; atomI.labFrameQuadrupole[1]*atomI.labFrameQuadrupole[1] +
float yr = delta.y; atomI.labFrameQuadrupole[2]*atomI.labFrameQuadrupole[2] +
float zr = delta.z; atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4];
float r = delta.w;
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float rr11 = 9.0f*rr9*rr2;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float offset = 1.0f-scalingFactors[MScaleIndex];
float gf4 = 2.0f*(bn2 - offset*rr5);
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qkdi1 = qk1*di1 + qk2*di2 + qk3*di3;
float ftm21 = gf4*(qkdi1-qidk1);
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qkdi2 = qk2*di1 + qk5*di2 + qk6*di3;
float ftm22 = gf4*(qkdi2-qidk2);
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float qkdi3 = qk3*di1 + qk6*di2 + qk9*di3;
float ftm23 = gf4*(qkdi3-qidk3);
float gf7 = 4.0f*(bn3 - offset*rr7);
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qkqir1 = qk1*qir1 + qk2*qir2 + qk3*qir3;
ftm21 += gf7*(qiqkr1+qkqir1);
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qkqir2 = qk2*qir1 + qk5*qir2 + qk6*qir3;
ftm22 += gf7*(qiqkr2+qkqir2);
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float qkqir3 = qk3*qir1 + qk6*qir2 + qk9*qir3;
ftm23 += gf7*(qiqkr3+qkqir3);
// calculate the scalar products for permanent components
float sc2 = di1*dk1 + di2*dk2 + di3*dk3;
float sc7 = qir1*dk1 + qir2*dk2 + qir3*dk3;
float sc8 = qkr1*di1 + qkr2*di2 + qkr3*di3;
float sc9 = qir1*qkr1 + qir2*qkr2 + qir3*qkr3;
float sc10 = qi1*qk1 + qi2*qk2 + qi3*qk3 + qi2*qk2 + qi5*qk5 + qi6*qk6 + qi3*qk3 + qi6*qk6 + qi9*qk9;
float sc3 = di1*xr + di2*yr + di3*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
// calculate the scalar products for induced components
// calculate the gl functions for permanent components
float gl0 = ci*ck;
float gl1 = ck*sc3 - ci*sc4;
float gl2 = ci*sc6 + ck*sc5 - sc3*sc4;
float gl3 = sc3*sc6 - sc4*sc5;
float gl4 = sc5*sc6;
float gl5 = -4.0f * sc9;
float gl6 = sc2;
float gl7 = 2.0f * (sc7-sc8);
float gl8 = 2.0f * sc10;
forceTorqueEnergy->w += -offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*rr5)*(gl2+gl7+gl8) + (bn3-offset*rr7)*(gl3+gl5) + (bn4-offset*rr9)*gl4;
// calculate the gl functions for induced components
// intermediate variables for permanent force terms
float gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*(gl2+gl7+gl8) + bn4*(gl3+gl5) + bn5*gl4;
gf1 -= offset*(rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4);
ftm21 += gf1*xr;
ftm22 += gf1*yr;
ftm23 += gf1*zr;
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3 - offset*(-ck*rr3 + sc4*rr5 - sc6*rr7);
ftm21 += gf2*di1;
ftm22 += gf2*di2;
ftm23 += gf2*di3;
float gf3 = ci*bn1 + sc3*bn2 + sc5*bn3 - offset*(ci*rr3 + sc3*rr5 + sc5*rr7);
ftm21 += gf3*dk1;
ftm22 += gf3*dk2;
ftm23 += gf3*dk3;
float gf5 = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4 - offset*(-ck*rr5+sc4*rr7-sc6*rr9));
ftm21 += gf5*qir1;
ftm22 += gf5*qir2;
ftm23 += gf5*qir3;
float gf6 = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4 - offset*(-ci*rr5-sc3*rr7-sc5*rr9));
ftm21 += gf6*qkr1;
ftm22 += gf6*qkr2;
ftm23 += gf6*qkr3;
forceTorqueEnergy->x = ftm21;
forceTorqueEnergy->y = ftm22;
forceTorqueEnergy->z = ftm23;
return;
}
static __device__ void calculatePmeDirectElectrostaticPairIxnF2_kernel(
const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = r/damp;
damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
} else {
damp = 0.0f;
}
}
float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex];
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qkui1 = qk1*atomI.inducedDipole[0] + qk2*atomI.inducedDipole[1] + qk3*atomI.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qkuip1 = qk1*atomI.inducedDipoleP[0] + qk2*atomI.inducedDipoleP[1] + qk3*atomI.inducedDipoleP[2];
forceTorqueEnergy->x += bn2*(qkui1+qkuip1-qiuk1-qiukp1);
forceTorqueEnergy->x -= rr5*((qkui1-qiuk1)*psc5 + (qkuip1-qiukp1)*dsc5);
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qkui2 = qk2*atomI.inducedDipole[0] + qk5*atomI.inducedDipole[1] + qk6*atomI.inducedDipole[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qkuip2 = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk6*atomI.inducedDipoleP[2];
forceTorqueEnergy->y += bn2*(qkui2+qkuip2-qiuk2-qiukp2);
forceTorqueEnergy->y -= rr5*((qkui2-qiuk2)*psc5 + (qkuip2-qiukp2)*dsc5);
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qkui3 = qk3*atomI.inducedDipole[0] + qk6*atomI.inducedDipole[1] + qk9*atomI.inducedDipole[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
float qkuip3 = qk3*atomI.inducedDipoleP[0] + qk6*atomI.inducedDipoleP[1] + qk9*atomI.inducedDipoleP[2];
forceTorqueEnergy->z += bn2*(qkui3+qkuip3-qiuk3-qiukp3);
forceTorqueEnergy->z -= rr5*((qkui3-qiuk3)*psc5 + (qkuip3-qiukp3)*dsc5);
float sc3 = di1*xr + di2*yr + di3*zr;
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
// calculate the scalar products for induced components
float sci3 = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float scip3 = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
float usc5 = 1.0f - scale5*scalingFactors[UScaleIndex];
float prefactor1 = 0.5f*( bn2 - rr5*usc5 );
forceTorqueEnergy->x += prefactor1*( (sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0]) + sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0] );
forceTorqueEnergy->y += prefactor1*( (sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1]) + sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1] );
forceTorqueEnergy->z += prefactor1*( (sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2]) + sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2] );
float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex];
float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];
float gfi5 = bn3*(sci4+scip4) - rr7*(sci4*psc7+scip4*dsc7);
forceTorqueEnergy->x += gfi5*qir1;
forceTorqueEnergy->y += gfi5*qir2;
forceTorqueEnergy->z += gfi5*qir3;
prefactor1 = 0.5f*( bn2*(sci4+scip4) - rr5*(sci4*psc5+scip4*dsc5) );
float prefactor2 = 0.5f*( bn2*(sci3+scip3) - rr5*(sci3*psc5+scip3*dsc5) );
forceTorqueEnergy->x += prefactor1*di1 + prefactor2*dk1;
forceTorqueEnergy->y += prefactor1*di2 + prefactor2*dk2;
forceTorqueEnergy->z += prefactor1*di3 + prefactor2*dk3;
float gfi6 = -bn3*(sci3+scip3) + rr7*(sci3*psc7+scip3*dsc7);
forceTorqueEnergy->x += gfi6*qkr1;
forceTorqueEnergy->y += gfi6*qkr2;
forceTorqueEnergy->z += gfi6*qkr3;
float sci1 = atomI.inducedDipole[0]*dk1 + atomI.inducedDipole[1]*dk2 + atomI.inducedDipole[2]*dk3 + di1*atomJ.inducedDipole[0] + di2*atomJ.inducedDipole[1] + di3*atomJ.inducedDipole[2];
float sci7 = qir1*atomJ.inducedDipole[0] + qir2*atomJ.inducedDipole[1] + qir3*atomJ.inducedDipole[2];
float sci8 = qkr1*atomI.inducedDipole[0] + qkr2*atomI.inducedDipole[1] + qkr3*atomI.inducedDipole[2];
float scip1 = atomI.inducedDipoleP[0]*dk1 + atomI.inducedDipoleP[1]*dk2 + atomI.inducedDipoleP[2]*dk3 + di1*atomJ.inducedDipoleP[0] + di2*atomJ.inducedDipoleP[1] + di3*atomJ.inducedDipoleP[2];
float scip2 = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0] +
atomI.inducedDipole[1]*atomJ.inducedDipoleP[1] +
atomI.inducedDipole[2]*atomJ.inducedDipoleP[2] +
atomJ.inducedDipole[0]*atomI.inducedDipoleP[0] +
atomJ.inducedDipole[1]*atomI.inducedDipoleP[1] +
atomJ.inducedDipole[2]*atomI.inducedDipoleP[2];
float scip7 = qir1*atomJ.inducedDipoleP[0] + qir2*atomJ.inducedDipoleP[1] + qir3*atomJ.inducedDipoleP[2];
float scip8 = qkr1*atomI.inducedDipoleP[0] + qkr2*atomI.inducedDipoleP[1] + qkr3*atomI.inducedDipoleP[2];
// calculate the gl functions for permanent components
// calculate the gl functions for induced components
float gli1 = ck*sci3 - ci*sci4;
float gli2 = -sc3*sci4 - sci3*sc4;
float gli3 = sci3*sc6 - sci4*sc5;
float gli6 = sci1;
float gli7 = 2.0f * (sci7-sci8);
float glip1 = ck*scip3 - ci*scip4;
float glip2 = -sc3*scip4 - scip3*sc4;
float glip3 = scip3*sc6 - scip4*sc5;
float glip6 = scip1;
float glip7 = 2.0f * (scip7-scip8);
float ei = (bn1*(gli1+gli6) + bn2*(gli2+gli7) + bn3*gli3);
float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
ei -= (rr3*(gli1+gli6)*psc3 + rr5*(gli2+gli7)*psc5 + rr7*gli3*psc7);
forceTorqueEnergy->w += 0.5f*ei;
float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex];
float usc3 = 1.0f - scale3*scalingFactors[UScaleIndex];
float gfi1 = (bn2*(gli1+glip1+gli6+glip6)
+ bn2*scip2
+ bn3*(gli2+glip2+gli7+glip7)
- bn3*(sci3*scip4+scip3*sci4)
+ bn4*(gli3+glip3));
gfi1 -= rr5*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*usc3) +
rr7*((gli7+gli2)*psc5 + (glip7+glip2)*dsc5 - (sci3*scip4+scip3*sci4)*usc5) + rr9*(gli3*psc7+glip3*dsc7);
gfi1 *= 0.5f;
forceTorqueEnergy->x += gfi1*xr;
forceTorqueEnergy->y += gfi1*yr;
forceTorqueEnergy->z += gfi1*zr;
float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float ftm2i1 = gfi2*(atomI.inducedDipole[0]+atomI.inducedDipoleP[0]) + gfi3*(atomJ.inducedDipole[0]+atomJ.inducedDipoleP[0]);
float ftm2i2 = gfi2*(atomI.inducedDipole[1]+atomI.inducedDipoleP[1]) + gfi3*(atomJ.inducedDipole[1]+atomJ.inducedDipoleP[1]);
float ftm2i3 = gfi2*(atomI.inducedDipole[2]+atomI.inducedDipoleP[2]) + gfi3*(atomJ.inducedDipole[2]+atomJ.inducedDipoleP[2]);
forceTorqueEnergy->x -= 0.5f*
( -rr3*ck *(atomI.inducedDipole[0]*psc3 + atomI.inducedDipoleP[0]*dsc3)
+ rr5*sc4*(atomI.inducedDipole[0]*psc5 + atomI.inducedDipoleP[0]*dsc5)
- rr7*sc6*(atomI.inducedDipole[0]*psc7 + atomI.inducedDipoleP[0]*dsc7)
+ rr3*ci* (atomJ.inducedDipole[0]*psc3 + atomJ.inducedDipoleP[0]*dsc3)
+ rr5*sc3*(atomJ.inducedDipole[0]*psc5 + atomJ.inducedDipoleP[0]*dsc5)
+ rr7*sc5*(atomJ.inducedDipole[0]*psc7 + atomJ.inducedDipoleP[0]*dsc7));
forceTorqueEnergy->y -= 0.5f*
(- rr3*ck *(atomI.inducedDipole[1]*psc3 + atomI.inducedDipoleP[1]*dsc3)
+ rr5*sc4*(atomI.inducedDipole[1]*psc5 + atomI.inducedDipoleP[1]*dsc5)
- rr7*sc6*(atomI.inducedDipole[1]*psc7 + atomI.inducedDipoleP[1]*dsc7)
+ rr3*ci *(atomJ.inducedDipole[1]*psc3 + atomJ.inducedDipoleP[1]*dsc3)
+ rr5*sc3*(atomJ.inducedDipole[1]*psc5 + atomJ.inducedDipoleP[1]*dsc5)
+ rr7*sc5*(atomJ.inducedDipole[1]*psc7 + atomJ.inducedDipoleP[1]*dsc7));
forceTorqueEnergy->z -= 0.5f*
(- rr3*ck *(atomI.inducedDipole[2]*psc3 + atomI.inducedDipoleP[2]*dsc3)
+ rr5*sc4*(atomI.inducedDipole[2]*psc5 + atomI.inducedDipoleP[2]*dsc5)
- rr7*sc6*(atomI.inducedDipole[2]*psc7 + atomI.inducedDipoleP[2]*dsc7)
+ rr3*ci *(atomJ.inducedDipole[2]*psc3 + atomJ.inducedDipoleP[2]*dsc3)
+ rr5*sc3*(atomJ.inducedDipole[2]*psc5 + atomJ.inducedDipoleP[2]*dsc5)
+ rr7*sc5*(atomJ.inducedDipole[2]*psc7 + atomJ.inducedDipoleP[2]*dsc7));
if( damp != 0.0f ){
float expdamp = exp(damp);
float temp3 = -3.0f*damp*expdamp*rr2;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
float ddsc31 = temp3 * xr;
float ddsc32 = temp3 * yr;
float ddsc33 = temp3 * zr;
float ddsc51 = temp5 * ddsc31;
float ddsc52 = temp5 * ddsc32;
float ddsc53 = temp5 * ddsc33;
float ddsc71 = temp7 * ddsc51;
float ddsc72 = temp7 * ddsc52;
float ddsc73 = temp7 * ddsc53;
temp3 = rr3*((gli1+gli6)*scalingFactors[PScaleIndex] + (glip1+glip6)*scalingFactors[DScaleIndex]);
temp5 = rr5*((gli2+gli7)*scalingFactors[PScaleIndex] + (glip2+glip7)*scalingFactors[DScaleIndex]);
temp7 = rr7*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
ftm2i1 -= (temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
ftm2i2 -= (temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
ftm2i3 -= (temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
if( cAmoebaSim.polarizationType == 0 ){
temp3 = rr3 * scalingFactors[UScaleIndex] * scip2;
temp5 = -rr5 * scalingFactors[UScaleIndex] * (sci3*scip4+scip3*sci4);
ftm2i1 -= (temp3*ddsc31 + temp5*ddsc51);
ftm2i2 -= (temp3*ddsc32 + temp5*ddsc52);
ftm2i3 -= (temp3*ddsc33 + temp5*ddsc53);
}
}
if( cAmoebaSim.polarizationType ){
float gfd = (bn2*scip2 - bn3*(scip3*sci4+sci3*scip4)) - (rr5*scip2*usc3 - rr7*(scip3*sci4 +sci3*scip4)*usc5);
float p1 = (bn2 - usc5*rr5);
ftm2i1 -= gfd*xr + p1*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0]+sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
ftm2i2 -= gfd*yr + p1*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1]+sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
ftm2i3 -= gfd*zr + p1*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2]+sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
}
forceTorqueEnergy->x += 0.5f*ftm2i1;
forceTorqueEnergy->y += 0.5f*ftm2i2;
forceTorqueEnergy->z += 0.5f*ftm2i3;
return;
}
static __device__ void calculatePmeDirectElectrostaticPairIxnT1_kernel(
const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors qii *= 2.0f;
float rr1 = 1.0f/r; float uii = atomI.labFrameDipole[0]*atomI.inducedDipole[0] + atomI.labFrameDipole[1]*atomI.inducedDipole[1] + atomI.labFrameDipole[2]*atomI.inducedDipole[2];
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float scale = 1.0f-scalingFactors[MScaleIndex];
float prefactor = scale*rr3 - bn1;
float dixdk1 = di2*dk3 - di3*dk2;
float ttm21 = prefactor*dixdk1;
float dixdk2 = di3*dk1 - di1*dk3;
float ttm22 = prefactor*dixdk2;
float dixdk3 = di1*dk2 - di2*dk1;
float ttm23 = prefactor*dixdk3;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float rxqikr1 = yr*qiqkr3 - zr*qiqkr2;
float qkrxqir1 = qkr2*qir3 - qkr3*qir2;
prefactor = 4.0f*(bn3 - scale*rr7);
ttm21 -= prefactor*(rxqikr1+qkrxqir1);
float rxqikr2 = zr*qiqkr1 - xr*qiqkr3;
float qkrxqir2 = qkr3*qir1 - qkr1*qir3;
ttm22 -= prefactor*(rxqikr2+qkrxqir2);
float rxqikr3 = xr*qiqkr2 - yr*qiqkr1;
float qkrxqir3 = qkr1*qir2 - qkr2*qir1;
ttm23 -= prefactor*(rxqikr3+qkrxqir3);
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float dixqkr1 = di2*qkr3 - di3*qkr2;
float dkxqir1 = dk2*qir3 - dk3*qir2;
float rxqidk1 = yr*qidk3 - zr*qidk2;
float qixqk1 = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
prefactor = 2.0f*(bn2 - scale*rr5);
ttm21 += prefactor*(dixqkr1+dkxqir1+rxqidk1-2.0f*qixqk1);
float dixqkr2 = di3*qkr1 - di1*qkr3;
float dkxqir2 = dk3*qir1 - dk1*qir3;
float rxqidk2 = zr*qidk1 - xr*qidk3;
float qixqk2 = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
ttm22 += prefactor*(dixqkr2+dkxqir2+rxqidk2-2.0f*qixqk2);
float dixqkr3 = di1*qkr2 - di2*qkr1;
float dkxqir3 = dk1*qir2 - dk2*qir1;
float rxqidk3 = xr*qidk2 - yr*qidk1;
float qixqk3 = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
ttm23 += prefactor*(dixqkr3+dkxqir3+rxqidk3-2.0f*qixqk3);
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfr2 = -ck*rr3 + sc4*rr5 - sc6*rr7;
prefactor = (gf2 - scale*gfr2);
ttm21 += prefactor*(di2*zr - di3*yr);
ttm22 += prefactor*(di3*xr - di1*zr);
ttm23 += prefactor*(di1*yr - di2*xr);
float gf5 = (-ck*bn2+sc4*bn3-sc6*bn4);
float gfr5 = (-ck*rr5+sc4*rr7-sc6*rr9);
prefactor = 2.0f*(gf5 - scale*gfr5);
float rxqir1 = yr*qir3 - zr*qir2;
float rxqir2 = zr*qir1 - xr*qir3;
float rxqir3 = xr*qir2 - yr*qir1;
ttm21 -= prefactor*rxqir1;
ttm22 -= prefactor*rxqir2;
ttm23 -= prefactor*rxqir3;
forceTorqueEnergy->x = ttm21;
forceTorqueEnergy->y = ttm22;
forceTorqueEnergy->z = ttm23;
return;
float energy = (cii + term*(dii/3.0f + 2.0f*term*qii/5.0f));
energy += term*uii/3.0f;
energy *= fterm;
*totalEnergy += energy;
} }
static __device__ void calculatePmeDirectElectrostaticPairIxnT2_kernel( // self-torque for PME
const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = r/damp;
damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
}
}
float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex]; __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI )
float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex]; {
float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex]; float term = (2.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];
float prefactor1 = 0.5f*(psc3*rr3 - bn1);
float prefactor2 = 0.5f*(dsc3*rr3 - bn1);
float dixuk1 = di2*atomJ.inducedDipole[2] - di3*atomJ.inducedDipole[1];
float dixukp1 = di2*atomJ.inducedDipoleP[2] - di3*atomJ.inducedDipoleP[1];
float ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
float dixuk2 = di3*atomJ.inducedDipole[0] - di1*atomJ.inducedDipole[2];
float dixukp2 = di3*atomJ.inducedDipoleP[0] - di1*atomJ.inducedDipoleP[2];
float ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
float dixuk3 = di1*atomJ.inducedDipole[1] - di2*atomJ.inducedDipole[0];
float dixukp3 = di1*atomJ.inducedDipoleP[1] - di2*atomJ.inducedDipoleP[0];
float ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
float gti2 = bn2*(sci4+scip4);
float gtri2 = rr5*(sci4*psc5+scip4*dsc5);
prefactor1 = 0.5f*(gti2 - gtri2);
ttm2i1 += prefactor1*( di2*zr - di3*yr );
ttm2i2 += prefactor1*( di3*xr - di1*zr );
ttm2i3 += prefactor1*( di1*yr - di2*xr );
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
prefactor1 = rr7*(sci4*psc7+scip4*dsc7) - bn3*(sci4+scip4);
ttm2i1 += prefactor1*( yr*qir3 - zr*qir2 );
ttm2i2 += prefactor1*( zr*qir1 - xr*qir3 );
ttm2i3 += prefactor1*( xr*qir2 - yr*qir1 );
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
prefactor1 = (bn2 - rr5*psc5);
prefactor2 = (bn2 - rr5*dsc5);
float ukxqir1 = atomJ.inducedDipole[1]*qir3 - atomJ.inducedDipole[2]*qir2;
float ukxqirp1 = atomJ.inducedDipoleP[1]*qir3 - atomJ.inducedDipoleP[2]*qir2;
float rxqiuk1 = yr*qiuk3 - zr*qiuk2;
float rxqiukp1 = yr*qiukp3 - zr*qiukp2;
ttm2i1 += prefactor1*(ukxqir1 + rxqiuk1) + prefactor2*(ukxqirp1 + rxqiukp1);
float ukxqir2 = atomJ.inducedDipole[2]*qir1 - atomJ.inducedDipole[0]*qir3;
float ukxqirp2 = atomJ.inducedDipoleP[2]*qir1 - atomJ.inducedDipoleP[0]*qir3;
float rxqiuk2 = zr*qiuk1 - xr*qiuk3;
float rxqiukp2 = zr*qiukp1 - xr*qiukp3;
ttm2i2 += prefactor1*(ukxqir2 + rxqiuk2) + prefactor2*(ukxqirp2 + rxqiukp2);
float ukxqir3 = atomJ.inducedDipole[0]*qir2 - atomJ.inducedDipole[1]*qir1;
float ukxqirp3 = atomJ.inducedDipoleP[0]*qir2 - atomJ.inducedDipoleP[1]*qir1;
float rxqiuk3 = xr*qiuk2 - yr*qiuk1;
float rxqiukp3 = xr*qiukp2 - yr*qiukp1;
ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3) + prefactor2*(ukxqirp3 + rxqiukp3);
forceTorqueEnergy->x += ttm2i1;
forceTorqueEnergy->y += ttm2i2;
forceTorqueEnergy->z += ttm2i3;
return; float uix = (atomI.inducedDipole[0] + atomI.inducedDipoleP[0]);
float uiy = (atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
float uiz = (atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
} }
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ, __device__ void calculateBn_kernel( float r, float4* bn, float* bn0, float *bn5 ){
const float* scalingFactors, float4 forceTorqueEnergy[3] ){
float4 delta;
delta.x = atomJ.x - atomI.x;
delta.y = atomJ.y - atomI.y;
delta.z = atomJ.z - atomI.z;
// periodic box
delta.x -= floor(delta.x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta.y -= floor(delta.y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta.z -= floor(delta.z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if( delta.w > cSim.nonbondedCutoffSqr || delta.w == 0.0f ){
forceTorqueEnergy[0].x = forceTorqueEnergy[0].y = forceTorqueEnergy[0].z = forceTorqueEnergy[0].w = 0.0f;
forceTorqueEnergy[1].x = forceTorqueEnergy[1].y = forceTorqueEnergy[1].z = 0.0f;
forceTorqueEnergy[2].x = forceTorqueEnergy[2].y = forceTorqueEnergy[2].z = 0.0f;
return;
}
delta.w = sqrt(delta.w);
float r = delta.w;
float ralpha = cSim.alphaEwald*r; float ralpha = cSim.alphaEwald*r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald; float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 0.0f; float alsq2n = 0.0f;
if( cSim.alphaEwald > 0.0f ){ if( cSim.alphaEwald > 0.0f ){
...@@ -935,59 +175,45 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( const PmeDirectEl ...@@ -935,59 +175,45 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( const PmeDirectEl
float exp2a = exp(-(ralpha*ralpha)); float exp2a = exp(-(ralpha*ralpha));
float rr1 = 1.0f/r; float rr1 = 1.0f/r;
float bn0 = erfc(ralpha)*rr1; *bn0 = erfc(ralpha)*rr1;
float rr2 = rr1*rr1; float rr2 = rr1*rr1;
forceTorqueEnergy[0].w = atomI.q*atomJ.q*bn0;
alsq2n *= alsq2; alsq2n *= alsq2;
float4 bn; bn->x = (*bn0+alsq2n*exp2a)*rr2;
bn.x = (bn0+alsq2n*exp2a)*rr2;
alsq2n *= alsq2; alsq2n *= alsq2;
bn.y = (3.0f*bn.x+alsq2n*exp2a)*rr2; bn->y = (3.0f*bn->x+alsq2n*exp2a)*rr2;
alsq2n *= alsq2; alsq2n *= alsq2;
bn.z = (5.0f*bn.y+alsq2n*exp2a)*rr2; bn->z = (5.0f*bn->y+alsq2n*exp2a)*rr2;
alsq2n *= alsq2; alsq2n *= alsq2;
bn.w = (7.0f*bn.z+alsq2n*exp2a)*rr2; bn->w = (7.0f*bn->z+alsq2n*exp2a)*rr2;
alsq2n *= alsq2; alsq2n *= alsq2;
float bn5 = (9.0f*bn.w+alsq2n*exp2a)*rr2; *bn5 = (9.0f*bn->w+alsq2n*exp2a)*rr2;
//calculatePmeDirectElectrostaticPairIxn_F1_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnF1_kernel( atomI, atomJ, delta, bn, bn5, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxn_F2_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnF2_kernel( atomI, atomJ, delta, bn, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxn_T1_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnT1_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+1) );
//calculatePmeDirectElectrostaticPairIxn_T2_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnT2_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+1) );
//calculatePmeDirectElectrostaticPairIxn_T3_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxnT3_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
// T3 == T1 w/ particles I and J reversed
// T4 == T2 w/ particles I and J reversed
delta.x *= -1.0f;
delta.y *= -1.0f;
delta.z *= -1.0f;
calculatePmeDirectElectrostaticPairIxnT1_kernel( atomJ, atomI, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
calculatePmeDirectElectrostaticPairIxnT2_kernel( atomJ, atomI, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
//calculatePmeDirectElectrostaticPairIxn_T4_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxnT4_kernel( atomI, atomJ, delta, scalingFactors, (forceTorqueEnergy+2) );
return;
} }
__device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ, #define SUB_METHOD_NAME(a, b) a##Scale##b
float* scalingFactors, float4 forceTorqueEnergy[3] #define APPLY_SCALE
#include "kCalculateAmoebaCudaPmeDirectElectrostaticF1.h"
#include "kCalculateAmoebaCudaPmeDirectElectrostaticF2P.h"
#include "kCalculateAmoebaCudaPmeDirectElectrostaticT1.h"
#include "kCalculateAmoebaCudaPmeDirectElectrostaticT2.h"
//#include "kCalculateAmoebaCudaPmeDirectElectrostaticDriver.h"
#undef APPLY_SCALE
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##NoScale##b
#include "kCalculateAmoebaCudaPmeDirectElectrostaticF1.h"
#include "kCalculateAmoebaCudaPmeDirectElectrostaticF2P.h"
//#include "kCalculateAmoebaCudaPmeDirectElectrostaticT1.h"
//#include "kCalculateAmoebaCudaPmeDirectElectrostaticT2.h"
//#include "kCalculateAmoebaCudaPmeDirectElectrostaticDriver.h"
#undef SUB_METHOD_NAME
__device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float* scalingFactors, float4 forceTorqueEnergy[3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
,float4* debugArray ,float4* debugArray
#endif #endif
...@@ -1022,7 +248,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( PmeDirectElec ...@@ -1022,7 +248,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( PmeDirectElec
float qi3 = atomI.labFrameQuadrupole[2]; float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3]; float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4]; float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5]; //float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float dk1 = atomJ.labFrameDipole[0]; float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1]; float dk2 = atomJ.labFrameDipole[1];
...@@ -1033,7 +260,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( PmeDirectElec ...@@ -1033,7 +260,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( PmeDirectElec
float qk3 = atomJ.labFrameQuadrupole[2]; float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3]; float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4]; float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5]; //float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(atomI.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]);
// calculate the real space error function terms // calculate the real space error function terms
...@@ -1838,7 +1066,201 @@ for( int ii = 0; ii < 12; ii++ ){ ...@@ -1838,7 +1066,201 @@ for( int ii = 0; ii < 12; ii++ ){
} }
__device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticParticle* sA, unsigned int atomI ) __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
bool bExclusionFlag, const float* scalingFactors, float forceFactor, float* energy ){
float4 delta;
delta.x = atomJ.x - atomI.x;
delta.y = atomJ.y - atomI.y;
delta.z = atomJ.z - atomI.z;
// periodic box
delta.x -= floor(delta.x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta.y -= floor(delta.y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta.z -= floor(delta.z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if( delta.w > cSim.nonbondedCutoffSqr ){
return;
}
float r = sqrt(delta.w);
float ralpha = cSim.alphaEwald*r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 0.0f;
if( cSim.alphaEwald > 0.0f ){
alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
}
float exp2a = exp(-(ralpha*ralpha));
float rr1 = 1.0f/r;
delta.w = rr1;
float bn0 = erfc(ralpha)*rr1;
*energy += forceFactor*atomI.q*atomJ.q*bn0;
float rr2 = rr1*rr1;
alsq2n *= alsq2;
float4 bn;
bn.x = (bn0+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.y = (3.0f*bn.x+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.z = (5.0f*bn.y+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.w = (7.0f*bn.z+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
float bn5 = (9.0f*bn.w+alsq2n*exp2a)*rr2;
/*
float bn0, bn5;
float4 bn;
calculateBn_kernel( r, &bn, &bn0, &bn5 );
*energy += forceFactor*atomI.q*atomJ.q*bn0;
delta.w = 1.0f/r;
*/
float force[3];
//calculatePmeDirectElectrostaticPairIxnF1Scale_kernel( atomI, atomJ, delta, bn, bn5, forceFactor, scalingFactors, force, energy );
if( bExclusionFlag ){
calculatePmeDirectElectrostaticPairIxnF1Scale_kernel( atomI, atomJ, delta, bn, bn5, forceFactor, scalingFactors, force, energy );
calculatePmeDirectElectrostaticPairIxnF2Scale_kernel( atomI, atomJ, delta, bn, forceFactor, scalingFactors, force, energy );
} else {
calculatePmeDirectElectrostaticPairIxnF1NoScale_kernel( atomI, atomJ, delta, bn, bn5, forceFactor, force, energy );
calculatePmeDirectElectrostaticPairIxnF2NoScale_kernel( atomI, atomJ, delta, bn, forceFactor, force, energy );
}
atomI.force[0] += force[0];
atomI.force[1] += force[1];
atomI.force[2] += force[2];
if( forceFactor == 1.0f ){
atomJ.force[0] -= force[0];
atomJ.force[1] -= force[1];
atomJ.force[2] -= force[2];
}
calculatePmeDirectElectrostaticPairIxnT1Scale_kernel( atomI, atomJ, delta, bn, scalingFactors );
calculatePmeDirectElectrostaticPairIxnT2Scale_kernel( atomI, atomJ, delta, bn, scalingFactors );
if( forceFactor == 1.0f ){
// T3 == T1 w/ particles I and J reversed
// T4 == T2 w/ particles I and J reversed
delta.x *= -1.0f;
delta.y *= -1.0f;
delta.z *= -1.0f;
calculatePmeDirectElectrostaticPairIxnT1Scale_kernel( atomJ, atomI, delta, bn, scalingFactors );
calculatePmeDirectElectrostaticPairIxnT2Scale_kernel( atomJ, atomI, delta, bn, scalingFactors );
}
return;
}
#ifdef OLD
__device__ void calculatePmeDirectElectrostaticPairIxnOrigg_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
bool bExclusionFlag, const float* scalingFactors, float forceFactor, float* energy ){
float4 delta;
delta.x = atomJ.x - atomI.x;
delta.y = atomJ.y - atomI.y;
delta.z = atomJ.z - atomI.z;
// periodic box
delta.x -= floor(delta.x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta.y -= floor(delta.y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta.z -= floor(delta.z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if( delta.w > cSim.nonbondedCutoffSqr ){
return;
}
float r = sqrt(delta.w);
float ralpha = cSim.alphaEwald*r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 0.0f;
if( cSim.alphaEwald > 0.0f ){
alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
}
float exp2a = exp(-(ralpha*ralpha));
float rr1 = 1.0f/r;
delta.w = rr1;
float bn0 = erfc(ralpha)*rr1;
*energy += forceFactor*atomI.q*atomJ.q*bn0;
float rr2 = rr1*rr1;
alsq2n *= alsq2;
float4 bn;
bn.x = (bn0+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.y = (3.0f*bn.x+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.z = (5.0f*bn.y+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.w = (7.0f*bn.z+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
float bn5 = (9.0f*bn.w+alsq2n*exp2a)*rr2;
float force[3];
//calculatePmeDirectElectrostaticPairIxnF1Scale_kernel( atomI, atomJ, delta, bn, bn5, forceFactor, scalingFactors, force, energy );
if( bExclusionFlag ){
calculatePmeDirectElectrostaticPairIxnF1Scale_kernel( atomI, atomJ, delta, bn, bn5, forceFactor, scalingFactors, force, energy );
calculatePmeDirectElectrostaticPairIxnF2Scale_kernel( atomI, atomJ, delta, bn, forceFactor, scalingFactors, force, energy );
} else {
calculatePmeDirectElectrostaticPairIxnF1NoScale_kernel( atomI, atomJ, delta, bn, bn5, forceFactor, force, energy );
calculatePmeDirectElectrostaticPairIxnF2NoScale_kernel( atomI, atomJ, delta, bn, forceFactor, force, energy );
}
atomI.force[0] += force[0];
atomI.force[1] += force[1];
atomI.force[2] += force[2];
if( forceFactor == 1.0f ){
atomJ.force[0] -= force[0];
atomJ.force[1] -= force[1];
atomJ.force[2] -= force[2];
}
calculatePmeDirectElectrostaticPairIxnT1Scale_kernel( atomI, atomJ, delta, bn, scalingFactors );
calculatePmeDirectElectrostaticPairIxnT2Scale_kernel( atomI, atomJ, delta, bn, scalingFactors );
if( forceFactor == 1.0f ){
// T3 == T1 w/ particles I and J reversed
// T4 == T2 w/ particles I and J reversed
delta.x *= -1.0f;
delta.y *= -1.0f;
delta.z *= -1.0f;
calculatePmeDirectElectrostaticPairIxnT1Scale_kernel( atomJ, atomI, delta, bn, scalingFactors );
calculatePmeDirectElectrostaticPairIxnT2Scale_kernel( atomJ, atomI, delta, bn, scalingFactors );
}
return;
}
#endif
__device__ void loadPmeDirectElectrostaticParticle( unsigned int atomI, struct PmeDirectElectrostaticParticle* sA )
{ {
// coordinates & charge // coordinates & charge
float4 posq = cSim.pPosq[atomI]; float4 posq = cSim.pPosq[atomI];
...@@ -1853,7 +1275,6 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP ...@@ -1853,7 +1275,6 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
sA->labFrameDipole[1] = cAmoebaSim.pLabFrameDipole[atomI*3+1]; sA->labFrameDipole[1] = cAmoebaSim.pLabFrameDipole[atomI*3+1];
sA->labFrameDipole[2] = cAmoebaSim.pLabFrameDipole[atomI*3+2]; sA->labFrameDipole[2] = cAmoebaSim.pLabFrameDipole[atomI*3+2];
// lab quadrupole // lab quadrupole
sA->labFrameQuadrupole[0] = cAmoebaSim.pLabFrameQuadrupole[atomI*9]; sA->labFrameQuadrupole[0] = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
...@@ -1861,7 +1282,11 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP ...@@ -1861,7 +1282,11 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
sA->labFrameQuadrupole[2] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2]; sA->labFrameQuadrupole[2] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
sA->labFrameQuadrupole[3] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4]; sA->labFrameQuadrupole[3] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
sA->labFrameQuadrupole[4] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5]; sA->labFrameQuadrupole[4] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
sA->labFrameQuadrupole[5] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
// traceless tensor
//sA->labFrameQuadrupole[5] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
//sA->labFrameQuadrupole[5] = -(sA->labFrameQuadrupole[0] + sA->labFrameQuadrupole[3]);
// induced dipole // induced dipole
...@@ -1881,6 +1306,19 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP ...@@ -1881,6 +1306,19 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
} }
__device__ void zeroPmeDirectElectrostaticParticle( struct PmeDirectElectrostaticParticle* sA )
{
sA->force[0] = 0.0f;
sA->force[1] = 0.0f;
sA->force[2] = 0.0f;
sA->torque[0] = 0.0f;
sA->torque[1] = 0.0f;
sA->torque[2] = 0.0f;
}
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
#undef USE_OUTPUT_BUFFER_PER_WARP #undef USE_OUTPUT_BUFFER_PER_WARP
......
...@@ -34,17 +34,7 @@ __launch_bounds__(128, 1) ...@@ -34,17 +34,7 @@ __launch_bounds__(128, 1)
#else #else
__launch_bounds__(64, 1) __launch_bounds__(64, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( unsigned int* workUnit, float* outputTorque ){
unsigned int* workUnit, float* outputTorque
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
int maxPullIndex = 7;
float4 pullBack[12];
#endif
extern __shared__ PmeDirectElectrostaticParticle sA[]; extern __shared__ PmeDirectElectrostaticParticle sA[];
...@@ -55,7 +45,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -55,7 +45,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float totalEnergy = 0.0f; float totalEnergy = 0.0f;
float4 forceTorqueEnergy[3];
float scalingFactors[LastScalingIndex]; float scalingFactors[LastScalingIndex];
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec); float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
...@@ -74,174 +63,89 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -74,174 +63,89 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag ); decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
PmeDirectElectrostaticParticle* psA = &sA[tbx]; PmeDirectElectrostaticParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx; unsigned int atomI = x + tgx;
PmeDirectElectrostaticParticle localParticle; PmeDirectElectrostaticParticle localParticleI;
loadPmeDirectElectrostaticShared(&localParticle, atomI ); loadPmeDirectElectrostaticParticle( atomI, &localParticleI );
localParticle.force[0] = 0.0f; zeroPmeDirectElectrostaticParticle( &localParticleI );
localParticle.force[1] = 0.0f; scalingFactors[UScaleIndex] = 1.0f;
localParticle.force[2] = 0.0f;
localParticle.torque[0] = 0.0f;
localParticle.torque[1] = 0.0f;
localParticle.torque[2] = 0.0f;
scalingFactors[UScaleIndex] = 1.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
{ {
// load shared data // load shared data
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI ); loadPmeDirectElectrostaticParticle( atomI, &(sA[threadIdx.x]) );
if (bExclusionFlag)
{
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++)
{
unsigned int atomJ = y + j;
// set scale factors
if( atomI < cSim.atoms ){
if (bExclusionFlag) if (bExclusionFlag)
{ {
getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex ); unsigned int xi = x >> GRIDBITS;
getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex ); unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex ); dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
} }
// force for (unsigned int j = 0; j < GRID && (y+j) < cSim.atoms; j++)
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j], scalingFactors, forceTorqueEnergy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
if( (atomI != atomJ) && (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
{ {
localParticle.force[0] += forceTorqueEnergy[0].x;
localParticle.force[1] += forceTorqueEnergy[0].y;
localParticle.force[2] += forceTorqueEnergy[0].z;
localParticle.torque[0] += forceTorqueEnergy[1].x; if( atomI != (y+j) )
localParticle.torque[1] += forceTorqueEnergy[1].y; {
localParticle.torque[2] += forceTorqueEnergy[1].z; if (bExclusionFlag)
{
getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
}
calculatePmeDirectElectrostaticPairIxn_kernel( localParticleI, psA[j], bExclusionFlag, scalingFactors, 0.5f, &totalEnergy);
}
} // end of j-loop
// include self energy and self torque
calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticleI );
calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticleI, &totalEnergy );
localParticleI.force[0] *= conversionFactor;
localParticleI.force[1] *= conversionFactor;
localParticleI.force[2] *= conversionFactor;
localParticleI.torque[0] *= -conversionFactor;
localParticleI.torque[1] *= -conversionFactor;
localParticleI.torque[2] *= -conversionFactor;
// Write results
// energy for each diagonal-block ixn included twice, hence factor of 0.5
totalEnergy += 0.5*forceTorqueEnergy[0].w;
}
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int mask = ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
float blockId = 1.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) y;
debugArray[index].w = blockId;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[0].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[0].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[0].z : 0.0f;
debugArray[index].w = mask ? forceTorqueEnergy[0].w : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[1].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[1].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[1].z : 0.0f;
float offsetF = (float)(3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[2].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[2].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[2].z : 0.0f;
debugArray[index].w = offsetF;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
} // end of j-loop
// include self energy and self torque
if( atomI < cSim.atoms ){
calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
float energy;
calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
totalEnergy += energy;
}
localParticle.force[0] *= conversionFactor;
localParticle.force[1] *= conversionFactor;
localParticle.force[2] *= conversionFactor;
localParticle.torque[0] *= -conversionFactor;
localParticle.torque[1] *= -conversionFactor;
localParticle.torque[2] *= -conversionFactor;
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = (x + tgx + warp*cSim.paddedNumberOfAtoms); unsigned int offset = (x + tgx + warp*cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticleI.force, cSim.pForce4 );
add3dArray( 3*offset, localParticle.torque, outputTorque ); add3dArray( 3*offset, localParticleI.torque, outputTorque );
#else #else
unsigned int offset = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms); unsigned int offset = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticleI.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque ); load3dArray( 3*offset, localParticleI.torque, outputTorque );
#endif #endif
}
} else { } else {
if (lasty != y) { if (lasty != y) {
loadPmeDirectElectrostaticParticle( (y+tgx), &(sA[threadIdx.x]) );
// load shared data
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
} }
unsigned int flags = cSim.pInteractionFlag[pos]; if (cSim.pInteractionFlag[pos] != 0 ) {
if (flags == 0) {
// No interactions in this block.
} else {
#ifdef CALCULATE_FULL_TILE zeroPmeDirectElectrostaticParticle( &(sA[threadIdx.x]) );
flags = 0xFFFFFFFF; /*
#endif
sA[threadIdx.x].force[0] = 0.0f; sA[threadIdx.x].force[0] = 0.0f;
sA[threadIdx.x].force[1] = 0.0f; sA[threadIdx.x].force[1] = 0.0f;
sA[threadIdx.x].force[2] = 0.0f; sA[threadIdx.x].force[2] = 0.0f;
...@@ -249,7 +153,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -249,7 +153,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
sA[threadIdx.x].torque[0] = 0.0f; sA[threadIdx.x].torque[0] = 0.0f;
sA[threadIdx.x].torque[1] = 0.0f; sA[threadIdx.x].torque[1] = 0.0f;
sA[threadIdx.x].torque[2] = 0.0f; sA[threadIdx.x].torque[2] = 0.0f;
*/
if( bExclusionFlag ) if( bExclusionFlag )
{ {
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
...@@ -264,165 +168,49 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -264,165 +168,49 @@ if( atomI == targetAtom || atomJ == targetAtom ){
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
if( (flags & (1<<j) ) != 0)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
// set scale factors
if( bExclusionFlag ) // set scale factors and calculate force
{
getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
}
// force if( bExclusionFlag ){
getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
}
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx], // check if atoms out-of-bounds
scalingFactors, forceTorqueEnergy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// check if atoms out-of-bounds if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) )
{
if( (atomI < cSim.atoms) && (atomJ < cSim.atoms) ) calculatePmeDirectElectrostaticPairIxn_kernel( localParticleI, psA[tj], bExclusionFlag, scalingFactors, 1.0f, &totalEnergy);
{ }
// add force and torque to atom I due atom J
localParticle.force[0] += forceTorqueEnergy[0].x;
localParticle.force[1] += forceTorqueEnergy[0].y;
localParticle.force[2] += forceTorqueEnergy[0].z;
totalEnergy += forceTorqueEnergy[0].w;
localParticle.torque[0] += forceTorqueEnergy[1].x;
localParticle.torque[1] += forceTorqueEnergy[1].y;
localParticle.torque[2] += forceTorqueEnergy[1].z;
// add force and torque to atom J due atom I
if( flags == 0xFFFFFFFF ){
psA[jIdx].force[0] -= forceTorqueEnergy[0].x;
psA[jIdx].force[1] -= forceTorqueEnergy[0].y;
psA[jIdx].force[2] -= forceTorqueEnergy[0].z;
psA[jIdx].torque[0] += forceTorqueEnergy[2].x;
psA[jIdx].torque[1] += forceTorqueEnergy[2].y;
psA[jIdx].torque[2] += forceTorqueEnergy[2].z;
#ifndef CALCULATE_FULL_TILE
} else {
sA[threadIdx.x].tempForce[0] = forceTorqueEnergy[0].x;
sA[threadIdx.x].tempForce[1] = forceTorqueEnergy[0].y;
sA[threadIdx.x].tempForce[2] = forceTorqueEnergy[0].z;
sA[threadIdx.x].tempTorque[0] = forceTorqueEnergy[2].x;
sA[threadIdx.x].tempTorque[1] = forceTorqueEnergy[2].y;
sA[threadIdx.x].tempTorque[2] = forceTorqueEnergy[2].z;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].force[0] -= sA[threadIdx.x].tempForce[0] + sA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= sA[threadIdx.x].tempForce[1] + sA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= sA[threadIdx.x].tempForce[2] + sA[threadIdx.x+16].tempForce[2];
psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
}
#endif
}
} // end of atoms out-of-bounds
} // end of flags&(1<<j block
#ifdef AMOEBA_DEBUG
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
unsigned int mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) y;
debugArray[index].w = (flags == 0xFFFFFFFF) ? (float) -141.0f : -151.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[0].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[0].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[0].z : 0.0f;
debugArray[index].w = mask ? forceTorqueEnergy[0].w : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[1].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[1].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[1].z : 0.0f;
float offsetF = (float)(3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[2].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[2].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[2].z : 0.0f;
offsetF = (float) (3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} // end of j-loop } // end of j-loop
localParticle.force[0] *= conversionFactor; localParticleI.force[0] *= conversionFactor;
localParticle.force[1] *= conversionFactor; localParticleI.force[1] *= conversionFactor;
localParticle.force[2] *= conversionFactor; localParticleI.force[2] *= conversionFactor;
localParticle.torque[0] *= -conversionFactor; localParticleI.torque[0] *= -conversionFactor;
localParticle.torque[1] *= -conversionFactor; localParticleI.torque[1] *= -conversionFactor;
localParticle.torque[2] *= -conversionFactor; localParticleI.torque[2] *= -conversionFactor;
sA[threadIdx.x].force[0] *= conversionFactor; sA[threadIdx.x].force[0] *= conversionFactor;
sA[threadIdx.x].force[1] *= conversionFactor; sA[threadIdx.x].force[1] *= conversionFactor;
sA[threadIdx.x].force[2] *= conversionFactor; sA[threadIdx.x].force[2] *= conversionFactor;
sA[threadIdx.x].torque[0] *= -conversionFactor; sA[threadIdx.x].torque[0] *= -conversionFactor;
sA[threadIdx.x].torque[1] *= -conversionFactor; sA[threadIdx.x].torque[1] *= -conversionFactor;
sA[threadIdx.x].torque[2] *= -conversionFactor; sA[threadIdx.x].torque[2] *= -conversionFactor;
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = (x + tgx + warp*cSim.paddedNumberOfAtoms); unsigned int offset = (x + tgx + warp*cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticleI.force, cSim.pForce4 );
add3dArray( 3*offset, localParticle.torque, outputTorque ); add3dArray( 3*offset, localParticleI.torque, outputTorque );
offset = (y + tgx + warp*cSim.paddedNumberOfAtoms); offset = (y + tgx + warp*cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
...@@ -430,18 +218,18 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -430,18 +218,18 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#else #else
unsigned int offset = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms); unsigned int offset = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticleI.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque ); load3dArray( 3*offset, localParticleI.torque, outputTorque );
offset = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms); offset = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque ); load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#endif #endif
lasty = y;
} // end of pInteractionFlag block } // end of pInteractionFlag block
lasty = y;
} }
pos++; pos++;
} }
......
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnF1, _kernel)( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float4 delta, float4 bn, float bn5, float forceFactor,
#ifdef APPLY_SCALE
const float* scalingFactors,
#endif
float force[3], float* energy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
#ifdef APPLY_SCALE
float rr1 = delta.w;
#endif
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
//float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
// float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(atomJ.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
#ifdef APPLY_SCALE
float offset = 1.0f-scalingFactors[MScaleIndex];
float rr3 = rr1*rr1*rr1;
float gf4 = 2.0f*(bn2 - 3.0f*offset*rr3*rr1*rr1);
#else
float gf4 = 2.0f*bn2;
#endif
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qkdi1 = qk1*di1 + qk2*di2 + qk3*di3;
float ftm21 = gf4*(qkdi1-qidk1);
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qkdi2 = qk2*di1 + qk5*di2 + qk6*di3;
float ftm22 = gf4*(qkdi2-qidk2);
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float qkdi3 = qk3*di1 + qk6*di2 + qk9*di3;
float ftm23 = gf4*(qkdi3-qidk3);
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
#ifdef APPLY_SCALE
float gf7 = 4.0f*(bn3 - 15.0f*offset*rr3*rr3*rr1);
#else
float gf7 = 4.0f*bn3;
#endif
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qkqir1 = qk1*qir1 + qk2*qir2 + qk3*qir3;
ftm21 += gf7*(qiqkr1+qkqir1);
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qkqir2 = qk2*qir1 + qk5*qir2 + qk6*qir3;
ftm22 += gf7*(qiqkr2+qkqir2);
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float qkqir3 = qk3*qir1 + qk6*qir2 + qk9*qir3;
ftm23 += gf7*(qiqkr3+qkqir3);
// calculate the scalar products for permanent components
float gl6 = di1*dk1 + di2*dk2 + di3*dk3;
float gl7 = 2.0f*( qir1*dk1 + qir2*dk2 + qir3*dk3 - ( qkr1*di1 + qkr2*di2 + qkr3*di3 ) );
float gl5 = -4.0f*(qir1*qkr1 + qir2*qkr2 + qir3*qkr3);
float gl8 = 2.0f*(qi1*qk1 + qi2*qk2 + qi3*qk3 + qi2*qk2 + qi5*qk5 + qi6*qk6 + qi3*qk3 + qi6*qk6 + qi9*qk9 );
float sc3 = di1*xr + di2*yr + di3*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float gl0 = ci*ck;
float gl1 = ck*sc3 - ci*sc4;
float gl2 = ci*sc6 + ck*sc5 - sc3*sc4;
float gl3 = sc3*sc6 - sc4*sc5;
float gl4 = sc5*sc6;
#ifdef APPLY_SCALE
//forceTorqueEnergy->w += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3.0f*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15.0f*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105.0f*rr3*rr3*rr3))*gl4);
*energy += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3.0f*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15.0f*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105.0f*rr3*rr3*rr3))*gl4);
#else
//forceTorqueEnergy->w += bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4;
*energy += forceFactor*(bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4);
#endif
float gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*(gl2+gl7+gl8) + bn4*(gl3+gl5) + bn5*gl4;
#ifdef APPLY_SCALE
gf1 -= offset*(rr3*gl0 + (3.0f*rr3*rr1*rr1)*(gl1+gl6) + (15.0f*rr3*rr3*rr1)*(gl2+gl7+gl8) + (105.0f*rr3*rr3*rr3)*(gl3+gl5) + (945.0f*rr3*rr3*rr3*rr1*rr1)*gl4);
#endif
ftm21 += gf1*xr;
ftm22 += gf1*yr;
ftm23 += gf1*zr;
#ifdef APPLY_SCALE
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3 - offset*(-ck*rr3 + sc4*(3.0f*rr3*rr1*rr1) - sc6*(15.0f*rr3*rr3*rr1));
#else
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
#endif
ftm21 += gf2*di1;
ftm22 += gf2*di2;
ftm23 += gf2*di3;
#ifdef APPLY_SCALE
float gf5 = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4 - offset*(-ck*(3.0f*rr3*rr1*rr1)+sc4*(15.0f*rr3*rr3*rr1)-sc6*(105.0f*rr3*rr3*rr3)));
#else
float gf5 = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4);
#endif
ftm21 += gf5*qir1;
ftm22 += gf5*qir2;
ftm23 += gf5*qir3;
#ifdef APPLY_SCALE
float gf3 = ci*bn1 + sc3*bn2 + sc5*bn3 - offset*(ci*rr3 + sc3*(3.0f*rr3*rr1*rr1) + sc5*(15.0f*rr3*rr3*rr1));
#else
float gf3 = ci*bn1 + sc3*bn2 + sc5*bn3;
#endif
ftm21 += gf3*dk1;
ftm22 += gf3*dk2;
ftm23 += gf3*dk3;
#ifdef APPLY_SCALE
float gf6 = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4 - offset*(-ci*(3.0f*rr3*rr1*rr1)-sc3*(15.0f*rr3*rr3*rr1)-sc5*(105.0f*rr3*rr3*rr3)));
#else
float gf6 = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4);
#endif
ftm21 += gf6*qkr1;
ftm22 += gf6*qkr2;
ftm23 += gf6*qkr3;
force[0] = ftm21;
force[1] = ftm22;
force[2] = ftm23;
/*
if( forceFactor == 1.0f ){
atomJ.force[0] -= ftm21;
atomJ.force[1] -= ftm22;
atomJ.force[2] -= ftm23;
}
atomI.force[0] += ftm21;
atomI.force[1] += ftm22;
atomI.force[2] += ftm23;
*/
return;
}
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnF2, _kernel )(
PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float4 delta, float4 bn, float forceFactor,
#ifdef APPLY_SCALE
const float* scalingFactors,
#endif
float force[3], float* energy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float rr1 = delta.w;
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
// float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = 1.0f/(rr1*damp);
damp = -pgamma*ratio*ratio*ratio;
damp = damp < -50.0f ? 0.0f : damp;
}
float scale5 = (damp == 0.0f) ? 1.0f : (1.0f - (1.0f-damp)*exp(damp));
float rr5 = rr1*rr1;
rr5 = 3.0f*rr1*rr5*rr5;
#ifdef APPLY_SCALE
float psc5 = rr5*(1.0f - scale5*scalingFactors[PScaleIndex]);
float dsc5 = rr5*(1.0f - scale5*scalingFactors[DScaleIndex]);
float usc5 = rr5*(1.0f - scale5*scalingFactors[UScaleIndex]);
#else
float psc5 = rr5*(1.0f - scale5);
#endif
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float ftm21 = -bn2*(qiuk1+qiukp1);
#ifdef APPLY_SCALE
ftm21 += qiuk1*psc5 + qiukp1*dsc5;
#else
ftm21 += (qiuk1 + qiukp1)*psc5;
#endif
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float ftm22 = -bn2*(qiuk2+qiukp2);
#ifdef APPLY_SCALE
ftm22 += ((qiuk2)*psc5 + (qiukp2)*dsc5);
#else
ftm22 += (qiuk2 + qiukp2)*psc5;
#endif
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
float ftm23 = -bn2*(qiuk3+qiukp3);
#ifdef APPLY_SCALE
ftm23 += ((qiuk3)*psc5 + (qiukp3)*dsc5);
#else
ftm23 += (qiuk3 + qiukp3)*psc5;
#endif
float expdamp = exp(damp);
float scale3 = (damp == 0.0f) ? 1.0f : (1.0f - expdamp);
float rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE
float psc3 = rr3*(1.0f - scale3*scalingFactors[PScaleIndex]);
float dsc3 = rr3*(1.0f - scale3*scalingFactors[DScaleIndex]);
float usc3 = rr3*(1.0f - scale3*scalingFactors[UScaleIndex]);
#else
float psc3 = rr3*(1.0f - scale3);
#endif
float scale7 = (damp == 0.0f) ? 1.0f : (1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp);
#ifdef APPLY_SCALE
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[PScaleIndex]);
float dsc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[DScaleIndex]);
#else
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7);
#endif
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float sc3 = di1*xr + di2*yr + di3*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float prefactor1;
prefactor1 = 0.5f*(ci*psc3 + sc3*psc5 + sc5*psc7 - gfi3);
ftm21 -= prefactor1*atomJ.inducedDipole[0];
ftm22 -= prefactor1*atomJ.inducedDipole[1];
ftm23 -= prefactor1*atomJ.inducedDipole[2];
#ifdef APPLY_SCALE
prefactor1 = 0.5f*(ci*dsc3 + sc3*dsc5 + sc5*dsc7 - gfi3);
#endif
ftm21 -= prefactor1*atomJ.inducedDipoleP[0];
ftm22 -= prefactor1*atomJ.inducedDipoleP[1];
ftm23 -= prefactor1*atomJ.inducedDipoleP[2];
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
//forceTorqueEnergy->w += 0.5f*((psc3-bn1)*(ci*sci4) + (psc5-bn2)*(sc3*sci4) + (psc7-bn3)*(sci4*sc5));
*energy += forceFactor*0.5f*((psc3-bn1)*(ci*sci4) + (psc5-bn2)*(sc3*sci4) + (psc7-bn3)*(sci4*sc5));
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2 - usc5 );
#else
prefactor1 = 0.5f*( bn2 - psc5 );
#endif
ftm21 += prefactor1*( (sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0]) );
ftm22 += prefactor1*( (sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1]) );
ftm23 += prefactor1*( (sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2]) );
}
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5) );
#else
sci4 += scip4;
prefactor1 = 0.5f*sci4*( bn2 - psc5 );
#endif
ftm21 += prefactor1*di1;
ftm22 += prefactor1*di2;
ftm23 += prefactor1*di3;
#ifdef APPLY_SCALE
float gfi5 = bn3*(sci4+scip4) - (sci4*psc7+scip4*dsc7);
#else
float gfi5 = sci4*(bn3 - psc7);
#endif
ftm21 += gfi5*qir1;
ftm22 += gfi5*qir2;
ftm23 += gfi5*qir3;
float sci7 = qir1*atomJ.inducedDipole[0] + qir2*atomJ.inducedDipole[1] + qir3*atomJ.inducedDipole[2];
//forceTorqueEnergy->w += (bn2-psc5)*sci7;
*energy += forceFactor*(bn2-psc5)*sci7;
float scip7 = qir1*atomJ.inducedDipoleP[0] + qir2*atomJ.inducedDipoleP[1] + qir3*atomJ.inducedDipoleP[2];
#ifdef APPLY_SCALE
float gli1 = -ci*sci4;
float gli2 = -sc3*sci4 + 2.0f*sci7;
float gli3 = -sci4*sc5;
float glip1 = -ci*scip4;
float glip2 = -sc3*scip4 + 2.0f*scip7;
float glip3 = -scip4*sc5;
#else
float gli1 = -ci*sci4;
float gli2 = -sc3*sci4 + 2.0f*(sci7 + scip7);
float gli3 = -sci4*sc5;
#endif
#ifdef APPLY_SCALE
float gfi1 = (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
gfi1 -= (rr1*rr1)*( 3.0f*(gli1*psc3 + glip1*dsc3) + 5.0f*(gli2*psc5 + glip2*dsc5 ) + 7.0f*(gli3*psc7+glip3*dsc7) );
#else
float gfi1 = bn2*gli1 + bn3*gli2 + bn4*gli3;
gfi1 -= (rr1*rr1)*( 3.0f*gli1*psc3 + 5.0f*gli2*psc5 + 7.0f*gli3*psc7);
#endif
gfi1 *= 0.5f;
ftm21 += gfi1*xr;
ftm22 += gfi1*yr;
ftm23 += gfi1*zr;
if( damp != 0.0f ){
float expdamp = exp(damp);
float temp3 = -1.5f*damp*expdamp*rr1*rr1;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
float ddsc31 = temp3*xr;
float ddsc32 = temp3*yr;
float ddsc33 = temp3*zr;
float ddsc51 = temp5*ddsc31;
float ddsc52 = temp5*ddsc32;
float ddsc53 = temp5*ddsc33;
float ddsc71 = temp7*ddsc51;
float ddsc72 = temp7*ddsc52;
float ddsc73 = temp7*ddsc53;
float rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE
temp3 = (gli1*scalingFactors[PScaleIndex] + glip1*scalingFactors[DScaleIndex]);
temp5 = (3.0f*rr1*rr1)*(gli2*scalingFactors[PScaleIndex] + glip2*scalingFactors[DScaleIndex]);
temp7 = (15.0f*rr3*rr1)*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
#else
temp3 = gli1;
temp5 = (3.0f*rr1*rr1)*gli2;
temp7 = (15.0f*rr3*rr1)*gli3;
#endif
ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
}
//K
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
//float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(qk1 + qk5);
float qkui1 = qk1*atomI.inducedDipole[0] + qk2*atomI.inducedDipole[1] + qk3*atomI.inducedDipole[2];
float qkuip1 = qk1*atomI.inducedDipoleP[0] + qk2*atomI.inducedDipoleP[1] + qk3*atomI.inducedDipoleP[2];
ftm21 += bn2*(qkui1+qkuip1);
#ifdef APPLY_SCALE
ftm21 -= (qkui1*psc5 + qkuip1*dsc5);
#else
ftm21 -= (qkui1 + qkuip1)*psc5;
#endif
float qkui2 = qk2*atomI.inducedDipole[0] + qk5*atomI.inducedDipole[1] + qk6*atomI.inducedDipole[2];
float qkuip2 = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk6*atomI.inducedDipoleP[2];
ftm22 += bn2*(qkui2+qkuip2);
#ifdef APPLY_SCALE
ftm22 -= ((qkui2)*psc5 + (qkuip2)*dsc5);
#else
ftm22 -= (qkui2 + qkuip2)*psc5;
#endif
float qkui3 = qk3*atomI.inducedDipole[0] + qk6*atomI.inducedDipole[1] + qk9*atomI.inducedDipole[2];
float qkuip3 = qk3*atomI.inducedDipoleP[0] + qk6*atomI.inducedDipoleP[1] + qk9*atomI.inducedDipoleP[2];
ftm23 += bn2*(qkui3+qkuip3);
#ifdef APPLY_SCALE
ftm23 -= ((qkui3)*psc5 + (qkuip3)*dsc5);
#else
ftm23 -= (qkui3 + qkuip3)*psc5;
#endif
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float ck = atomJ.q;
float gfi2 = (-ck*bn1 + sc4*bn2 - sc6*bn3);
prefactor1 = 0.5f*(ck*psc3 - sc4*psc5 + sc6*psc7 + gfi2);
ftm21 += prefactor1*atomI.inducedDipole[0];
ftm22 += prefactor1*atomI.inducedDipole[1];
ftm23 += prefactor1*atomI.inducedDipole[2];
#ifdef APPLY_SCALE
prefactor1 = 0.5f*(ck*dsc3 - sc4*dsc5 + sc6*dsc7 + gfi2);
#endif
ftm21 += prefactor1*atomI.inducedDipoleP[0];
ftm22 += prefactor1*atomI.inducedDipoleP[1];
ftm23 += prefactor1*atomI.inducedDipoleP[2];
float sci3 = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
//forceTorqueEnergy->w += 0.5f*( (ck*sci3)*(bn1-psc3) -(sci3*sc4)*(bn2-psc5) + sci3*sc6*(bn3-psc7) );
*energy += forceFactor*0.5f*( (ck*sci3)*(bn1-psc3) -(sci3*sc4)*(bn2-psc5) + sci3*sc6*(bn3-psc7) );
float scip3 = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2 - usc5 );
#else
prefactor1 = 0.5f*( bn2 - psc5 );
#endif
ftm21 += prefactor1*( sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0] );
ftm22 += prefactor1*( sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1] );
ftm23 += prefactor1*( sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2] );
}
float sci34;
if( cAmoebaSim.polarizationType == 0 ){
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
sci34 = (sci3*scip4+scip3*sci4);
#ifdef APPLY_SCALE
gfi1 = sci34*(usc5*(5.0f*rr1*rr1) -bn3 );
#else
gfi1 = sci34*(psc5*(5.0f*rr1*rr1) -bn3 );
#endif
} else {
gfi1 = 0.0f;
}
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5) );
#else
sci3 += scip3;
prefactor1 = 0.5f*sci3*( bn2 - psc5 );
#endif
ftm21 += prefactor1*dk1;
ftm22 += prefactor1*dk2;
ftm23 += prefactor1*dk3;
#ifdef APPLY_SCALE
float gfi6 = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
#else
float gfi6 = sci3*( psc7 - bn3);
#endif
ftm21 += gfi6*qkr1;
ftm22 += gfi6*qkr2;
ftm23 += gfi6*qkr3;
float sci1 = atomI.inducedDipole[0]*dk1 + atomI.inducedDipole[1]*dk2 + atomI.inducedDipole[2]*dk3 + di1*atomJ.inducedDipole[0] + di2*atomJ.inducedDipole[1] + di3*atomJ.inducedDipole[2];
//forceTorqueEnergy->w += 0.5f*( sci1*(bn1-psc3) );
*energy += forceFactor*0.5f*( sci1*(bn1-psc3) );
float sci8 = qkr1*atomI.inducedDipole[0] + qkr2*atomI.inducedDipole[1] + qkr3*atomI.inducedDipole[2];
//forceTorqueEnergy->w += sci8*(bn2-psc5);
*energy += forceFactor*sci8*(bn2-psc5);
float scip1 = atomI.inducedDipoleP[0]*dk1 + atomI.inducedDipoleP[1]*dk2 + atomI.inducedDipoleP[2]*dk3 + di1*atomJ.inducedDipoleP[0] + di2*atomJ.inducedDipoleP[1] + di3*atomJ.inducedDipoleP[2];
#ifndef APPLY_SCALE
sci1 += scip1;
#endif
float scip2 = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0] +
atomI.inducedDipole[1]*atomJ.inducedDipoleP[1] +
atomI.inducedDipole[2]*atomJ.inducedDipoleP[2] +
atomJ.inducedDipole[0]*atomI.inducedDipoleP[0] +
atomJ.inducedDipole[1]*atomI.inducedDipoleP[1] +
atomJ.inducedDipole[2]*atomI.inducedDipoleP[2];
float scip8 = qkr1*atomI.inducedDipoleP[0] + qkr2*atomI.inducedDipoleP[1] + qkr3*atomI.inducedDipoleP[2];
#ifndef APPLY_SCALE
sci8 += scip8;
#endif
gli1 = ck*sci3 + sci1;
gli2 = -(sci3*sc4 + 2.0f*sci8);
gli3 = sci3*sc6;
#ifdef APPLY_SCALE
glip1 = ck*scip3 + scip1;
glip2 = -(scip3*sc4 + 2.0f*scip8);
glip3 = scip3*sc6;
#endif
#ifdef APPLY_SCALE
gfi1 += (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
gfi1 -= (rr1*rr1)*( 3.0f*(gli1*psc3 + glip1*dsc3) + 5.0f*(gli2*psc5 + glip2*dsc5 ) + 7.0f*(gli3*psc7+glip3*dsc7) );
#else
gfi1 += (bn2*gli1 + bn3*gli2 + bn4*gli3);
gfi1 -= (rr1*rr1)*( 3.0f*gli1*psc3 + 5.0f*gli2*psc5 + 7.0f*gli3*psc7 );
#endif
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
gfi1 += scip2*(bn2 - (3.0f*rr1*rr1)*usc3);
#else
gfi1 += scip2*(bn2 - (3.0f*rr1*rr1)*psc3);
#endif
}
gfi1 *= 0.5f;
ftm21 += gfi1*xr;
ftm22 += gfi1*yr;
ftm23 += gfi1*zr;
if( damp != 0.0f ){
float expdamp = exp(damp);
float temp3 = -1.5f*damp*expdamp*rr1*rr1;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
float ddsc31 = temp3*xr;
float ddsc32 = temp3*yr;
float ddsc33 = temp3*zr;
float ddsc51 = temp5*ddsc31;
float ddsc52 = temp5*ddsc32;
float ddsc53 = temp5*ddsc33;
float ddsc71 = temp7*ddsc51;
float ddsc72 = temp7*ddsc52;
float ddsc73 = temp7*ddsc53;
float rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE
temp3 = gli1*scalingFactors[PScaleIndex] + glip1*scalingFactors[DScaleIndex];
temp5 = (3.0f*rr1*rr1)*( gli2*scalingFactors[PScaleIndex] + glip2*scalingFactors[DScaleIndex]);
temp7 = (15.0f*rr3*rr1)*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
#else
temp3 = gli1;
temp5 = (3.0f*rr1*rr1)*gli2;
temp7 = (15.0f*rr3*rr1)*(gli3);
#endif
ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
temp3 = scalingFactors[UScaleIndex]*scip2;
temp5 = -(3.0f*rr1*rr1)*scalingFactors[UScaleIndex]*sci34;
#else
temp3 = scip2;
temp5 = -(3.0f*rr1*rr1)*sci34;
#endif
ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
}
}
force[0] += ftm21;
force[1] += ftm22;
force[2] += ftm23;
/*
if( forceFactor == 1.0f ){
atomJ.force[0] -= ftm21;
atomJ.force[1] -= ftm22;
atomJ.force[2] -= ftm23;
}
atomI.force[0] += ftm21;
atomI.force[1] += ftm22;
atomI.force[2] += ftm23;
*/
/*
forceTorqueEnergy->x += ftm21;
forceTorqueEnergy->y += ftm22;
forceTorqueEnergy->z += ftm23;
*/
return;
}
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnT1, _kernel )(
PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn
#ifdef APPLY_SCALE
, const float* scalingFactors
#endif
){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
#ifdef APPLY_SCALE
float rr1 = delta.w;
#endif
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
//float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
//float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(atomJ.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors
#ifdef APPLY_SCALE
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float scale = 1.0f-scalingFactors[MScaleIndex];
float prefactor = scale*rr3 - bn1;
#else
float prefactor = -bn1;
#endif
float dixdk1 = di2*dk3 - di3*dk2;
float ttm21 = prefactor*dixdk1;
float dixdk2 = di3*dk1 - di1*dk3;
float ttm22 = prefactor*dixdk2;
float dixdk3 = di1*dk2 - di2*dk1;
float ttm23 = prefactor*dixdk3;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float rxqikr1 = yr*qiqkr3 - zr*qiqkr2;
float qkrxqir1 = qkr2*qir3 - qkr3*qir2;
#ifdef APPLY_SCALE
prefactor = 4.0f*(bn3 - scale*rr7);
#else
prefactor = 4.0f*bn3;
#endif
ttm21 -= prefactor*(rxqikr1+qkrxqir1);
float rxqikr2 = zr*qiqkr1 - xr*qiqkr3;
float qkrxqir2 = qkr3*qir1 - qkr1*qir3;
ttm22 -= prefactor*(rxqikr2+qkrxqir2);
float rxqikr3 = xr*qiqkr2 - yr*qiqkr1;
float qkrxqir3 = qkr1*qir2 - qkr2*qir1;
ttm23 -= prefactor*(rxqikr3+qkrxqir3);
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float dixqkr1 = di2*qkr3 - di3*qkr2;
float dkxqir1 = dk2*qir3 - dk3*qir2;
float rxqidk1 = yr*qidk3 - zr*qidk2;
float qixqk1 = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
#ifdef APPLY_SCALE
prefactor = 2.0f*(bn2 - scale*rr5);
#else
prefactor = 2.0f*bn2;
#endif
ttm21 += prefactor*(dixqkr1+dkxqir1+rxqidk1-2.0f*qixqk1);
float dixqkr2 = di3*qkr1 - di1*qkr3;
float dkxqir2 = dk3*qir1 - dk1*qir3;
float rxqidk2 = zr*qidk1 - xr*qidk3;
float qixqk2 = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
ttm22 += prefactor*(dixqkr2+dkxqir2+rxqidk2-2.0f*qixqk2);
float dixqkr3 = di1*qkr2 - di2*qkr1;
float dkxqir3 = dk1*qir2 - dk2*qir1;
float rxqidk3 = xr*qidk2 - yr*qidk1;
float qixqk3 = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
ttm23 += prefactor*(dixqkr3+dkxqir3+rxqidk3-2.0f*qixqk3);
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
#ifdef APPLY_SCALE
float gfr2 = -ck*rr3 + sc4*rr5 - sc6*rr7;
prefactor = (gf2 - scale*gfr2);
#else
prefactor = gf2;
#endif
ttm21 += prefactor*(di2*zr - di3*yr);
ttm22 += prefactor*(di3*xr - di1*zr);
ttm23 += prefactor*(di1*yr - di2*xr);
float gf5 = (-ck*bn2+sc4*bn3-sc6*bn4);
#ifdef APPLY_SCALE
float gfr5 = (-ck*rr5+sc4*rr7-sc6*rr9);
prefactor = 2.0f*(gf5 - scale*gfr5);
#else
prefactor = 2.0f*gf5;
#endif
float rxqir1 = yr*qir3 - zr*qir2;
float rxqir2 = zr*qir1 - xr*qir3;
float rxqir3 = xr*qir2 - yr*qir1;
ttm21 -= prefactor*rxqir1;
ttm22 -= prefactor*rxqir2;
ttm23 -= prefactor*rxqir3;
atomI.torque[0] += ttm21;
atomI.torque[1] += ttm22;
atomI.torque[2] += ttm23;
/*
torque[0] = ttm21;
torque[1] = ttm22;
torque[2] = ttm23;
*/
return;
}
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnT2, _kernel)(
PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn
#ifdef APPLY_SCALE
, const float* scalingFactors
#endif
){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float rr1 = delta.w;
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
//float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
// apply Thole polarization damping to scale factors
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = 1.0f/(rr1*damp);
damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
}
}
float rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE
float dsc3 = rr3*(1.0f - scale3*scalingFactors[DScaleIndex]);
float dsc5 = (3.0f*rr3*rr1*rr1)* (1.0f - scale5*scalingFactors[DScaleIndex]);
float dsc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[DScaleIndex]);
float psc3 = rr3*(1.0f - scale3*scalingFactors[PScaleIndex]);
float psc5 = (3.0f*rr3*rr1*rr1)*(1.0f - scale5*scalingFactors[PScaleIndex]);
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[PScaleIndex]);
#else
float psc3 = rr3*(1.0f - scale3);
float psc5 = (3.0f*rr3*rr1*rr1)*(1.0f - scale5);
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7);
#endif
float prefactor1 = 0.5f*(psc3 - bn1);
#ifdef APPLY_SCALE
float prefactor2 = 0.5f*(dsc3 - bn1);
#endif
float dixuk1 = di2*atomJ.inducedDipole[2] - di3*atomJ.inducedDipole[1];
float dixukp1 = di2*atomJ.inducedDipoleP[2] - di3*atomJ.inducedDipoleP[1];
#ifdef APPLY_SCALE
float ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
#else
float ttm2i1 = prefactor1*(dixuk1 + dixukp1);
#endif
float dixuk2 = di3*atomJ.inducedDipole[0] - di1*atomJ.inducedDipole[2];
float dixukp2 = di3*atomJ.inducedDipoleP[0] - di1*atomJ.inducedDipoleP[2];
#ifdef APPLY_SCALE
float ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
#else
float ttm2i2 = prefactor1*(dixuk2 + dixukp2);
#endif
float dixuk3 = di1*atomJ.inducedDipole[1] - di2*atomJ.inducedDipole[0];
float dixukp3 = di1*atomJ.inducedDipoleP[1] - di2*atomJ.inducedDipoleP[0];
#ifdef APPLY_SCALE
float ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
#else
float ttm2i3 = prefactor1*(dixuk3 + dixukp3);
#endif
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
float gti2 = bn2*(sci4+scip4);
#ifdef APPLY_SCALE
float gtri2 = (sci4*psc5+scip4*dsc5);
#else
float gtri2 = psc5*(sci4+scip4);
#endif
prefactor1 = 0.5f*(gti2 - gtri2);
ttm2i1 += prefactor1*( di2*zr - di3*yr );
ttm2i2 += prefactor1*( di3*xr - di1*zr );
ttm2i3 += prefactor1*( di1*yr - di2*xr );
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
#ifdef APPLY_SCALE
prefactor1 = sci4*psc7 + scip4*dsc7 - bn3*(sci4+scip4);
#else
prefactor1 = psc7*(sci4+scip4) - bn3*(sci4+scip4);
#endif
ttm2i1 += prefactor1*( yr*qir3 - zr*qir2 );
ttm2i2 += prefactor1*( zr*qir1 - xr*qir3 );
ttm2i3 += prefactor1*( xr*qir2 - yr*qir1 );
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
prefactor1 = (bn2 - psc5);
#ifdef APPLY_SCALE
prefactor2 = (bn2 - dsc5);
#endif
float ukxqir1 = atomJ.inducedDipole[1]*qir3 - atomJ.inducedDipole[2]*qir2;
float ukxqirp1 = atomJ.inducedDipoleP[1]*qir3 - atomJ.inducedDipoleP[2]*qir2;
float rxqiuk1 = yr*qiuk3 - zr*qiuk2;
float rxqiukp1 = yr*qiukp3 - zr*qiukp2;
#ifdef APPLY_SCALE
ttm2i1 += prefactor1*(ukxqir1 + rxqiuk1) + prefactor2*(ukxqirp1 + rxqiukp1);
#else
ttm2i1 += prefactor1*( ukxqir1 + rxqiuk1 + ukxqirp1 + rxqiukp1 );
#endif
float ukxqir2 = atomJ.inducedDipole[2]*qir1 - atomJ.inducedDipole[0]*qir3;
float ukxqirp2 = atomJ.inducedDipoleP[2]*qir1 - atomJ.inducedDipoleP[0]*qir3;
float rxqiuk2 = zr*qiuk1 - xr*qiuk3;
float rxqiukp2 = zr*qiukp1 - xr*qiukp3;
#ifdef APPLY_SCALE
ttm2i2 += prefactor1*(ukxqir2 + rxqiuk2) + prefactor2*(ukxqirp2 + rxqiukp2);
#else
ttm2i2 += prefactor1*( ukxqir2 + rxqiuk2 + ukxqirp2 + rxqiukp2 );
#endif
float ukxqir3 = atomJ.inducedDipole[0]*qir2 - atomJ.inducedDipole[1]*qir1;
float ukxqirp3 = atomJ.inducedDipoleP[0]*qir2 - atomJ.inducedDipoleP[1]*qir1;
float rxqiuk3 = xr*qiuk2 - yr*qiuk1;
float rxqiukp3 = xr*qiukp2 - yr*qiukp1;
#ifdef APPLY_SCALE
ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3) + prefactor2*(ukxqirp3 + rxqiukp3);
#else
ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3 + ukxqirp3 + rxqiukp3 );
#endif
atomI.torque[0] += ttm2i1;
atomI.torque[1] += ttm2i2;
atomI.torque[2] += ttm2i3;
/*
torque[0] += ttm2i1;
torque[1] += ttm2i2;
torque[2] += ttm2i3;
*/
return;
}
...@@ -378,13 +378,16 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) ...@@ -378,13 +378,16 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif #endif
static void kSorUpdateMutualInducedField_kernel( static void kSorUpdateMutualInducedField_kernel(
int numberOfEntries, float* polarizability, float* polarizability,
float* inducedDipole, float* inducedDipoleP, float* inducedDipole, float* inducedDipoleP,
float* fixedEField, float* fixedEFieldP, float* fixedEField, float* fixedEFieldP,
float* matrixProduct, float* matrixProductP ) float* matrixProduct, float* matrixProductP )
{ {
int pos = blockIdx.x*blockDim.x + threadIdx.x; int pos = blockIdx.x*blockDim.x + threadIdx.x;
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
const float polarSOR = 0.70f;
while( pos < 3*cSim.atoms ) while( pos < 3*cSim.atoms )
{ {
...@@ -393,14 +396,12 @@ static void kSorUpdateMutualInducedField_kernel( ...@@ -393,14 +396,12 @@ static void kSorUpdateMutualInducedField_kernel(
// add self terms to fields // add self terms to fields
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
matrixProduct[pos] += term*previousDipole; matrixProduct[pos] += term*previousDipole;
matrixProductP[pos] += term*previousDipoleP; matrixProductP[pos] += term*previousDipoleP;
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos]; inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos]; inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
const float polarSOR = 0.70f;
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole ); inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP ); inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
...@@ -445,9 +446,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -445,9 +446,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
static int iteration = 1;
int targetAtom = 546; int targetAtom = 546;
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply"; static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
static int iteration = 1;
if( 1 && amoebaGpu->log ){ if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s\n", methodName ); (void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -485,17 +486,30 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -485,17 +486,30 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else { } else {
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField"); LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
...@@ -507,8 +521,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -507,8 +521,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
iteration ); (void) fflush( amoebaGpu->log ); iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download(); outputArray->Download();
outputPolarArray->Download(); outputPolarArray->Download();
// debugArray->Download(); //debugArray->Download();
int maxPrint = 5000000; int maxPrint = 5;
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii); (void) fprintf( amoebaGpu->log, "%5d ", ii);
...@@ -551,7 +565,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -551,7 +565,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
iteration++; iteration++;
} }
// delete debugArray; delete debugArray;
#endif #endif
} }
...@@ -592,7 +606,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -592,7 +606,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>( kInitializeMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
gpu->natoms, gpu->natoms,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
...@@ -637,41 +651,47 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -637,41 +651,47 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] ); cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
kCalculateAmoebaPMEInducedDipoleField( amoebaGpu ); kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
// post matrix multiply #ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( iteration ); fileId.push_back( iteration );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
/*
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
*/
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[0], outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[0], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[1], outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[1], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMIPre", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePostPmeDirectMI", fileId, outputVector );
} }
#endif
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData ); amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdatePmeMutualInducedField"); LAUNCHERROR("kSorUpdatePmeMutualInducedField");
#ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( iteration ); fileId.push_back( iteration );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); //cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); //cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
//exit(0);
} }
#endif
// get total epsilon -- performing sums on gpu // get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>( kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
...@@ -679,9 +699,11 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -679,9 +699,11 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psCurrentEpsilon->_pDevData ); amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReducePmeMutualInducedFieldDelta"); LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
trackMutualInducedIterations( amoebaGpu, iteration); trackMutualInducedIterations( amoebaGpu, iteration);
} }
#endif
// Debye=48.033324f // Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download(); amoebaGpu->psCurrentEpsilon->Download();
...@@ -697,8 +719,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -697,8 +719,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psInducedDipole->Download(); amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download(); amoebaGpu->psInducedDipolePolar->Download();
#if 1 #if 1
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n", (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldBySOR iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1], amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done ); amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#else #else
...@@ -765,17 +787,15 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -765,17 +787,15 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n", (void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
amoebaGpu->psCurrentEpsilon->_pSysData[1], (void) fflush( amoebaGpu->log );
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log );
} }
#endif #endif
// exit if nan // exit if nan
if( 0 && amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){ if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
(void) fprintf( stderr, "PME MI iteration=%3d eps is nan -- exiting.\n", iteration ); (void) fprintf( stderr, "PME MI iteration=%3d eps is nan -- exiting.\n", iteration );
exit(0); exit(0);
} }
...@@ -786,6 +806,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -786,6 +806,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->mutualInducedDone = done; amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1; amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
...@@ -801,6 +822,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -801,6 +822,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipole, gpu->psAtomIndex->_pSysData, ++iteration, "CudaPmeMI", stderr ); checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipole, gpu->psAtomIndex->_pSysData, ++iteration, "CudaPmeMI", stderr );
checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, gpu->psAtomIndex->_pSysData, iteration, "CudaPmeMIPolar", stderr ); checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, gpu->psAtomIndex->_pSysData, iteration, "CudaPmeMIPolar", stderr );
} }
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
} }
......
...@@ -396,17 +396,22 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -396,17 +396,22 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download(); amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download();
amoebaGpu->psMolecularDipole->Download(); amoebaGpu->psMolecularDipole->Download();
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Download();
gpu->psPosq4->Download(); gpu->psPosq4->Download();
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
int mIndex = 3*ii; int mIndex = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s\n", ii, (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s [%6d %6d %6d %6d]\n", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
gpu->psPosq4->_pSysData[ii].x, gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y, gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : "") ); gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : ""),
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].w );
//if( ii == 30 )ii = gpu->natoms - 30; //if( ii == 30 )ii = gpu->natoms - 30;
} }
} }
...@@ -491,6 +496,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -491,6 +496,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
} else { } else {
cudaComputeAmoebaPmeElectrostatic( amoebaGpu ); cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
} }
} }
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
...@@ -118,7 +118,7 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear ) ...@@ -118,7 +118,7 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
{ {
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevData ); kClearFields_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevData );
LAUNCHERROR("kClearFields_3_1"); LAUNCHERROR("kClearFields_3_1");
if( numberToClear > 1 ){ if( numberToClear > 1 ){
......
...@@ -534,10 +534,12 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -534,10 +534,12 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
} }
#ifdef AMOEBA_DEBUG_PRINT
if( 0 ){ if( 0 ){
static int iteration = 0; static int iteration = 0;
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "\n\nzCoordPreCopyVdw", stderr ); checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "\n\nzCoordPreCopyVdw", stderr );
} }
#endif
kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates ); kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates );
kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates ); kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates );
...@@ -725,22 +727,27 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -725,22 +727,27 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
} }
#endif #endif
#ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
static int iteration = 0; static int iteration = 0;
checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "PreVdw", stderr ); checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "PreVdw", stderr );
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, iteration, "zCoordPreVdw", stderr ); checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, iteration, "zCoordPreVdw", stderr );
} }
#endif
kReduceVdw14_7( amoebaGpu, amoebaGpu->psWorkArray_3_2 ); kReduceVdw14_7( amoebaGpu, amoebaGpu->psWorkArray_3_2 );
#ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
static int iteration = 0; static int iteration = 0;
checkForNans( gpu->natoms, 3, amoebaGpu->psWorkArray_3_2, gpu->psAtomIndex->_pSysData, ++iteration, "Vdw32", stderr ); checkForNans( gpu->natoms, 3, amoebaGpu->psWorkArray_3_2, gpu->psAtomIndex->_pSysData, ++iteration, "Vdw32", stderr );
} }
#endif
kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 ); kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 ); kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
#ifdef AMOEBA_DEBUG
if( 0 ){ if( 0 ){
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* psTempForce = new CUDAStream<float4>(paddedNumberOfAtoms, 1, "psTempForce"); CUDAStream<float4>* psTempForce = new CUDAStream<float4>(paddedNumberOfAtoms, 1, "psTempForce");
...@@ -761,6 +768,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -761,6 +768,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
static int iteration = 0; static int iteration = 0;
checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "VdwForce", stderr ); checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "VdwForce", stderr );
} }
#endif
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
delete debugArray; delete debugArray;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment