Commit 8a331fb9 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Direct space optimizations

parent af4d503a
......@@ -3449,19 +3449,20 @@ tgx = 0;
Get threads/block
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread )
unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
{
unsigned int grid = amoebaGpu->gpuContext->grid;
unsigned int threadsPerBlock = (amoebaGpu->gpuContext->sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock;
threadsPerBlock *= grid;
threadsPerBlock *= grid;
return threadsPerBlock;
}
......
......@@ -160,7 +160,7 @@ extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAS
extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear );
extern void kClearFields_1( amoebaGpuContext amoebaGpu );
extern void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear );
extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread );
extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
//extern int isNanOrInfinity( double number );
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
......
......@@ -759,7 +759,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle), gpu->sharedMemoryPerBlock), maxThreads);
}
kClearFields_3( amoebaGpu, 1 );
......
......@@ -362,7 +362,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
kClearFields_3( amoebaGpu, 3 );
......
......@@ -108,7 +108,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -1813,7 +1813,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
......
......@@ -978,7 +978,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
maxThreads = 96;
else
maxThreads = 32;
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -490,7 +490,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -276,7 +276,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -4,7 +4,6 @@ struct MutualInducedParticle {
float x;
float y;
float z;
float q;
float inducedDipole[3];
float inducedDipolePolar[3];
......@@ -41,7 +40,6 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
sA->x = posq.x;
sA->y = posq.y;
sA->z = posq.z;
sA->q = posq.w;
// dipole
......
......@@ -38,6 +38,8 @@ static int const UScaleIndex = 2;
static int const MScaleIndex = 3;
static int const LastScalingIndex = 4;
#define CALCULATE_FULL_TILE
struct PmeDirectElectrostaticParticle {
// coordinates charge
......@@ -53,7 +55,7 @@ struct PmeDirectElectrostaticParticle {
// lab frame quadrupole
float labFrameQuadrupole[9];
float labFrameQuadrupole[6];
// induced dipole
......@@ -69,14 +71,16 @@ struct PmeDirectElectrostaticParticle {
float damp;
float force[3];
float torque[3];
float padding;
//float padding;
#ifndef CALCULATE_FULL_TILE
float tempForce[3];
float tempTorque[3];
#endif
};
#ifndef CALCULATE_FULL_TILE
__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){
atomI.tempForce[0] += atomJ.tempForce[0];
......@@ -87,6 +91,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
atomI.tempTorque[1] += atomJ.tempTorque[1];
atomI.tempTorque[2] += atomJ.tempTorque[2];
}
#endif
/*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
......@@ -124,11 +129,11 @@ __device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDir
atomI.labFrameDipole[2]*atomI.labFrameDipole[2];
float qii = atomI.labFrameQuadrupole[0]*atomI.labFrameQuadrupole[0] +
atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4] +
atomI.labFrameQuadrupole[8]*atomI.labFrameQuadrupole[8] + 2.0f*(
atomI.labFrameQuadrupole[3]*atomI.labFrameQuadrupole[3] +
atomI.labFrameQuadrupole[5]*atomI.labFrameQuadrupole[5] + 2.0f*(
atomI.labFrameQuadrupole[1]*atomI.labFrameQuadrupole[1] +
atomI.labFrameQuadrupole[2]*atomI.labFrameQuadrupole[2] +
atomI.labFrameQuadrupole[5]*atomI.labFrameQuadrupole[5]);
atomI.labFrameQuadrupole[4]*atomI.labFrameQuadrupole[4]);
float uii = atomI.labFrameDipole[0]*atomI.inducedDipole[0] + atomI.labFrameDipole[1]*atomI.inducedDipole[1] + atomI.labFrameDipole[2]*atomI.inducedDipole[2];
......@@ -152,7 +157,836 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
}
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
/*
#undef SUB_METHOD_NAME
#undef F1
#define SUB_METHOD_NAME(a, b) a##F1##b
#define F1
//#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F1
//#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##F2##b
#define F2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T1
#define SUB_METHOD_NAME(a, b) a##T1##b
#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T2##b
#define T2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T3
#define SUB_METHOD_NAME(a, b) a##T3##b
#define T3
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T3
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T4##b
#define T4
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T4
#undef SUB_METHOD_NAME
*/
static __device__ void calculatePmeDirectElectrostaticPairIxnF1_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float bn5, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float rr11 = 9.0f*rr9*rr2;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float offset = 1.0f-scalingFactors[MScaleIndex];
float gf4 = 2.0f*(bn2 - offset*rr5);
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qkdi1 = qk1*di1 + qk2*di2 + qk3*di3;
float ftm21 = gf4*(qkdi1-qidk1);
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qkdi2 = qk2*di1 + qk5*di2 + qk6*di3;
float ftm22 = gf4*(qkdi2-qidk2);
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float qkdi3 = qk3*di1 + qk6*di2 + qk9*di3;
float ftm23 = gf4*(qkdi3-qidk3);
float gf7 = 4.0f*(bn3 - offset*rr7);
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qkqir1 = qk1*qir1 + qk2*qir2 + qk3*qir3;
ftm21 += gf7*(qiqkr1+qkqir1);
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qkqir2 = qk2*qir1 + qk5*qir2 + qk6*qir3;
ftm22 += gf7*(qiqkr2+qkqir2);
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float qkqir3 = qk3*qir1 + qk6*qir2 + qk9*qir3;
ftm23 += gf7*(qiqkr3+qkqir3);
// calculate the scalar products for permanent components
float sc2 = di1*dk1 + di2*dk2 + di3*dk3;
float sc7 = qir1*dk1 + qir2*dk2 + qir3*dk3;
float sc8 = qkr1*di1 + qkr2*di2 + qkr3*di3;
float sc9 = qir1*qkr1 + qir2*qkr2 + qir3*qkr3;
float sc10 = qi1*qk1 + qi2*qk2 + qi3*qk3 + qi2*qk2 + qi5*qk5 + qi6*qk6 + qi3*qk3 + qi6*qk6 + qi9*qk9;
float sc3 = di1*xr + di2*yr + di3*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
// calculate the scalar products for induced components
// calculate the gl functions for permanent components
float gl0 = ci*ck;
float gl1 = ck*sc3 - ci*sc4;
float gl2 = ci*sc6 + ck*sc5 - sc3*sc4;
float gl3 = sc3*sc6 - sc4*sc5;
float gl4 = sc5*sc6;
float gl5 = -4.0f * sc9;
float gl6 = sc2;
float gl7 = 2.0f * (sc7-sc8);
float gl8 = 2.0f * sc10;
forceTorqueEnergy->w += -offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*rr5)*(gl2+gl7+gl8) + (bn3-offset*rr7)*(gl3+gl5) + (bn4-offset*rr9)*gl4;
// calculate the gl functions for induced components
// intermediate variables for permanent force terms
float gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*(gl2+gl7+gl8) + bn4*(gl3+gl5) + bn5*gl4;
gf1 -= offset*(rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4);
ftm21 += gf1*xr;
ftm22 += gf1*yr;
ftm23 += gf1*zr;
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3 - offset*(-ck*rr3 + sc4*rr5 - sc6*rr7);
ftm21 += gf2*di1;
ftm22 += gf2*di2;
ftm23 += gf2*di3;
float gf3 = ci*bn1 + sc3*bn2 + sc5*bn3 - offset*(ci*rr3 + sc3*rr5 + sc5*rr7);
ftm21 += gf3*dk1;
ftm22 += gf3*dk2;
ftm23 += gf3*dk3;
float gf5 = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4 - offset*(-ck*rr5+sc4*rr7-sc6*rr9));
ftm21 += gf5*qir1;
ftm22 += gf5*qir2;
ftm23 += gf5*qir3;
float gf6 = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4 - offset*(-ci*rr5-sc3*rr7-sc5*rr9));
ftm21 += gf6*qkr1;
ftm22 += gf6*qkr2;
ftm23 += gf6*qkr3;
forceTorqueEnergy->x = ftm21;
forceTorqueEnergy->y = ftm22;
forceTorqueEnergy->z = ftm23;
return;
}
static __device__ void calculatePmeDirectElectrostaticPairIxnF2_kernel(
const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = r/damp;
damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
} else {
damp = 0.0f;
}
}
float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex];
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qkui1 = qk1*atomI.inducedDipole[0] + qk2*atomI.inducedDipole[1] + qk3*atomI.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qkuip1 = qk1*atomI.inducedDipoleP[0] + qk2*atomI.inducedDipoleP[1] + qk3*atomI.inducedDipoleP[2];
forceTorqueEnergy->x += bn2*(qkui1+qkuip1-qiuk1-qiukp1);
forceTorqueEnergy->x -= rr5*((qkui1-qiuk1)*psc5 + (qkuip1-qiukp1)*dsc5);
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qkui2 = qk2*atomI.inducedDipole[0] + qk5*atomI.inducedDipole[1] + qk6*atomI.inducedDipole[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qkuip2 = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk6*atomI.inducedDipoleP[2];
forceTorqueEnergy->y += bn2*(qkui2+qkuip2-qiuk2-qiukp2);
forceTorqueEnergy->y -= rr5*((qkui2-qiuk2)*psc5 + (qkuip2-qiukp2)*dsc5);
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qkui3 = qk3*atomI.inducedDipole[0] + qk6*atomI.inducedDipole[1] + qk9*atomI.inducedDipole[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
float qkuip3 = qk3*atomI.inducedDipoleP[0] + qk6*atomI.inducedDipoleP[1] + qk9*atomI.inducedDipoleP[2];
forceTorqueEnergy->z += bn2*(qkui3+qkuip3-qiuk3-qiukp3);
forceTorqueEnergy->z -= rr5*((qkui3-qiuk3)*psc5 + (qkuip3-qiukp3)*dsc5);
float sc3 = di1*xr + di2*yr + di3*zr;
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
// calculate the scalar products for induced components
float sci3 = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float scip3 = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
float usc5 = 1.0f - scale5*scalingFactors[UScaleIndex];
float prefactor1 = 0.5f*( bn2 - rr5*usc5 );
forceTorqueEnergy->x += prefactor1*( (sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0]) + sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0] );
forceTorqueEnergy->y += prefactor1*( (sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1]) + sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1] );
forceTorqueEnergy->z += prefactor1*( (sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2]) + sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2] );
float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex];
float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];
float gfi5 = bn3*(sci4+scip4) - rr7*(sci4*psc7+scip4*dsc7);
forceTorqueEnergy->x += gfi5*qir1;
forceTorqueEnergy->y += gfi5*qir2;
forceTorqueEnergy->z += gfi5*qir3;
prefactor1 = 0.5f*( bn2*(sci4+scip4) - rr5*(sci4*psc5+scip4*dsc5) );
float prefactor2 = 0.5f*( bn2*(sci3+scip3) - rr5*(sci3*psc5+scip3*dsc5) );
forceTorqueEnergy->x += prefactor1*di1 + prefactor2*dk1;
forceTorqueEnergy->y += prefactor1*di2 + prefactor2*dk2;
forceTorqueEnergy->z += prefactor1*di3 + prefactor2*dk3;
float gfi6 = -bn3*(sci3+scip3) + rr7*(sci3*psc7+scip3*dsc7);
forceTorqueEnergy->x += gfi6*qkr1;
forceTorqueEnergy->y += gfi6*qkr2;
forceTorqueEnergy->z += gfi6*qkr3;
float sci1 = atomI.inducedDipole[0]*dk1 + atomI.inducedDipole[1]*dk2 + atomI.inducedDipole[2]*dk3 + di1*atomJ.inducedDipole[0] + di2*atomJ.inducedDipole[1] + di3*atomJ.inducedDipole[2];
float sci7 = qir1*atomJ.inducedDipole[0] + qir2*atomJ.inducedDipole[1] + qir3*atomJ.inducedDipole[2];
float sci8 = qkr1*atomI.inducedDipole[0] + qkr2*atomI.inducedDipole[1] + qkr3*atomI.inducedDipole[2];
float scip1 = atomI.inducedDipoleP[0]*dk1 + atomI.inducedDipoleP[1]*dk2 + atomI.inducedDipoleP[2]*dk3 + di1*atomJ.inducedDipoleP[0] + di2*atomJ.inducedDipoleP[1] + di3*atomJ.inducedDipoleP[2];
float scip2 = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0] +
atomI.inducedDipole[1]*atomJ.inducedDipoleP[1] +
atomI.inducedDipole[2]*atomJ.inducedDipoleP[2] +
atomJ.inducedDipole[0]*atomI.inducedDipoleP[0] +
atomJ.inducedDipole[1]*atomI.inducedDipoleP[1] +
atomJ.inducedDipole[2]*atomI.inducedDipoleP[2];
float scip7 = qir1*atomJ.inducedDipoleP[0] + qir2*atomJ.inducedDipoleP[1] + qir3*atomJ.inducedDipoleP[2];
float scip8 = qkr1*atomI.inducedDipoleP[0] + qkr2*atomI.inducedDipoleP[1] + qkr3*atomI.inducedDipoleP[2];
// calculate the gl functions for permanent components
// calculate the gl functions for induced components
float gli1 = ck*sci3 - ci*sci4;
float gli2 = -sc3*sci4 - sci3*sc4;
float gli3 = sci3*sc6 - sci4*sc5;
float gli6 = sci1;
float gli7 = 2.0f * (sci7-sci8);
float glip1 = ck*scip3 - ci*scip4;
float glip2 = -sc3*scip4 - scip3*sc4;
float glip3 = scip3*sc6 - scip4*sc5;
float glip6 = scip1;
float glip7 = 2.0f * (scip7-scip8);
float ei = (bn1*(gli1+gli6) + bn2*(gli2+gli7) + bn3*gli3);
float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
ei -= (rr3*(gli1+gli6)*psc3 + rr5*(gli2+gli7)*psc5 + rr7*gli3*psc7);
forceTorqueEnergy->w += 0.5f*ei;
float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex];
float usc3 = 1.0f - scale3*scalingFactors[UScaleIndex];
float gfi1 = (bn2*(gli1+glip1+gli6+glip6)
+ bn2*scip2
+ bn3*(gli2+glip2+gli7+glip7)
- bn3*(sci3*scip4+scip3*sci4)
+ bn4*(gli3+glip3));
gfi1 -= rr5*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*usc3) +
rr7*((gli7+gli2)*psc5 + (glip7+glip2)*dsc5 - (sci3*scip4+scip3*sci4)*usc5) + rr9*(gli3*psc7+glip3*dsc7);
gfi1 *= 0.5f;
forceTorqueEnergy->x += gfi1*xr;
forceTorqueEnergy->y += gfi1*yr;
forceTorqueEnergy->z += gfi1*zr;
float gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
float ftm2i1 = gfi2*(atomI.inducedDipole[0]+atomI.inducedDipoleP[0]) + gfi3*(atomJ.inducedDipole[0]+atomJ.inducedDipoleP[0]);
float ftm2i2 = gfi2*(atomI.inducedDipole[1]+atomI.inducedDipoleP[1]) + gfi3*(atomJ.inducedDipole[1]+atomJ.inducedDipoleP[1]);
float ftm2i3 = gfi2*(atomI.inducedDipole[2]+atomI.inducedDipoleP[2]) + gfi3*(atomJ.inducedDipole[2]+atomJ.inducedDipoleP[2]);
forceTorqueEnergy->x -= 0.5f*
( -rr3*ck *(atomI.inducedDipole[0]*psc3 + atomI.inducedDipoleP[0]*dsc3)
+ rr5*sc4*(atomI.inducedDipole[0]*psc5 + atomI.inducedDipoleP[0]*dsc5)
- rr7*sc6*(atomI.inducedDipole[0]*psc7 + atomI.inducedDipoleP[0]*dsc7)
+ rr3*ci* (atomJ.inducedDipole[0]*psc3 + atomJ.inducedDipoleP[0]*dsc3)
+ rr5*sc3*(atomJ.inducedDipole[0]*psc5 + atomJ.inducedDipoleP[0]*dsc5)
+ rr7*sc5*(atomJ.inducedDipole[0]*psc7 + atomJ.inducedDipoleP[0]*dsc7));
forceTorqueEnergy->y -= 0.5f*
(- rr3*ck *(atomI.inducedDipole[1]*psc3 + atomI.inducedDipoleP[1]*dsc3)
+ rr5*sc4*(atomI.inducedDipole[1]*psc5 + atomI.inducedDipoleP[1]*dsc5)
- rr7*sc6*(atomI.inducedDipole[1]*psc7 + atomI.inducedDipoleP[1]*dsc7)
+ rr3*ci *(atomJ.inducedDipole[1]*psc3 + atomJ.inducedDipoleP[1]*dsc3)
+ rr5*sc3*(atomJ.inducedDipole[1]*psc5 + atomJ.inducedDipoleP[1]*dsc5)
+ rr7*sc5*(atomJ.inducedDipole[1]*psc7 + atomJ.inducedDipoleP[1]*dsc7));
forceTorqueEnergy->z -= 0.5f*
(- rr3*ck *(atomI.inducedDipole[2]*psc3 + atomI.inducedDipoleP[2]*dsc3)
+ rr5*sc4*(atomI.inducedDipole[2]*psc5 + atomI.inducedDipoleP[2]*dsc5)
- rr7*sc6*(atomI.inducedDipole[2]*psc7 + atomI.inducedDipoleP[2]*dsc7)
+ rr3*ci *(atomJ.inducedDipole[2]*psc3 + atomJ.inducedDipoleP[2]*dsc3)
+ rr5*sc3*(atomJ.inducedDipole[2]*psc5 + atomJ.inducedDipoleP[2]*dsc5)
+ rr7*sc5*(atomJ.inducedDipole[2]*psc7 + atomJ.inducedDipoleP[2]*dsc7));
if( damp != 0.0f ){
float expdamp = exp(damp);
float temp3 = -3.0f*damp*expdamp*rr2;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
float ddsc31 = temp3 * xr;
float ddsc32 = temp3 * yr;
float ddsc33 = temp3 * zr;
float ddsc51 = temp5 * ddsc31;
float ddsc52 = temp5 * ddsc32;
float ddsc53 = temp5 * ddsc33;
float ddsc71 = temp7 * ddsc51;
float ddsc72 = temp7 * ddsc52;
float ddsc73 = temp7 * ddsc53;
temp3 = rr3*((gli1+gli6)*scalingFactors[PScaleIndex] + (glip1+glip6)*scalingFactors[DScaleIndex]);
temp5 = rr5*((gli2+gli7)*scalingFactors[PScaleIndex] + (glip2+glip7)*scalingFactors[DScaleIndex]);
temp7 = rr7*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
ftm2i1 -= (temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
ftm2i2 -= (temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
ftm2i3 -= (temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
if( cAmoebaSim.polarizationType == 0 ){
temp3 = rr3 * scalingFactors[UScaleIndex] * scip2;
temp5 = -rr5 * scalingFactors[UScaleIndex] * (sci3*scip4+scip3*sci4);
ftm2i1 -= (temp3*ddsc31 + temp5*ddsc51);
ftm2i2 -= (temp3*ddsc32 + temp5*ddsc52);
ftm2i3 -= (temp3*ddsc33 + temp5*ddsc53);
}
}
if( cAmoebaSim.polarizationType ){
float gfd = (bn2*scip2 - bn3*(scip3*sci4+sci3*scip4)) - (rr5*scip2*usc3 - rr7*(scip3*sci4 +sci3*scip4)*usc5);
float p1 = (bn2 - usc5*rr5);
ftm2i1 -= gfd*xr + p1*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0]+sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
ftm2i2 -= gfd*yr + p1*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1]+sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
ftm2i3 -= gfd*zr + p1*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2]+sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
}
forceTorqueEnergy->x += 0.5f*ftm2i1;
forceTorqueEnergy->y += 0.5f*ftm2i2;
forceTorqueEnergy->z += 0.5f*ftm2i3;
return;
}
static __device__ void calculatePmeDirectElectrostaticPairIxnT1_kernel(
const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float scale = 1.0f-scalingFactors[MScaleIndex];
float prefactor = scale*rr3 - bn1;
float dixdk1 = di2*dk3 - di3*dk2;
float ttm21 = prefactor*dixdk1;
float dixdk2 = di3*dk1 - di1*dk3;
float ttm22 = prefactor*dixdk2;
float dixdk3 = di1*dk2 - di2*dk1;
float ttm23 = prefactor*dixdk3;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float rxqikr1 = yr*qiqkr3 - zr*qiqkr2;
float qkrxqir1 = qkr2*qir3 - qkr3*qir2;
prefactor = 4.0f*(bn3 - scale*rr7);
ttm21 -= prefactor*(rxqikr1+qkrxqir1);
float rxqikr2 = zr*qiqkr1 - xr*qiqkr3;
float qkrxqir2 = qkr3*qir1 - qkr1*qir3;
ttm22 -= prefactor*(rxqikr2+qkrxqir2);
float rxqikr3 = xr*qiqkr2 - yr*qiqkr1;
float qkrxqir3 = qkr1*qir2 - qkr2*qir1;
ttm23 -= prefactor*(rxqikr3+qkrxqir3);
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float dixqkr1 = di2*qkr3 - di3*qkr2;
float dkxqir1 = dk2*qir3 - dk3*qir2;
float rxqidk1 = yr*qidk3 - zr*qidk2;
float qixqk1 = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
prefactor = 2.0f*(bn2 - scale*rr5);
ttm21 += prefactor*(dixqkr1+dkxqir1+rxqidk1-2.0f*qixqk1);
float dixqkr2 = di3*qkr1 - di1*qkr3;
float dkxqir2 = dk3*qir1 - dk1*qir3;
float rxqidk2 = zr*qidk1 - xr*qidk3;
float qixqk2 = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
ttm22 += prefactor*(dixqkr2+dkxqir2+rxqidk2-2.0f*qixqk2);
float dixqkr3 = di1*qkr2 - di2*qkr1;
float dkxqir3 = dk1*qir2 - dk2*qir1;
float rxqidk3 = xr*qidk2 - yr*qidk1;
float qixqk3 = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
ttm23 += prefactor*(dixqkr3+dkxqir3+rxqidk3-2.0f*qixqk3);
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
float gfr2 = -ck*rr3 + sc4*rr5 - sc6*rr7;
prefactor = (gf2 - scale*gfr2);
ttm21 += prefactor*(di2*zr - di3*yr);
ttm22 += prefactor*(di3*xr - di1*zr);
ttm23 += prefactor*(di1*yr - di2*xr);
float gf5 = (-ck*bn2+sc4*bn3-sc6*bn4);
float gfr5 = (-ck*rr5+sc4*rr7-sc6*rr9);
prefactor = 2.0f*(gf5 - scale*gfr5);
float rxqir1 = yr*qir3 - zr*qir2;
float rxqir2 = zr*qir1 - xr*qir3;
float rxqir3 = xr*qir2 - yr*qir1;
ttm21 -= prefactor*rxqir1;
ttm22 -= prefactor*rxqir2;
ttm23 -= prefactor*rxqir3;
forceTorqueEnergy->x = ttm21;
forceTorqueEnergy->y = ttm22;
forceTorqueEnergy->z = ttm23;
return;
}
static __device__ void calculatePmeDirectElectrostaticPairIxnT2_kernel(
const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn, const float* scalingFactors, float4* forceTorqueEnergy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float r = delta.w;
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = r/damp;
damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
}
}
float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex];
float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex];
float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex];
float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];
float prefactor1 = 0.5f*(psc3*rr3 - bn1);
float prefactor2 = 0.5f*(dsc3*rr3 - bn1);
float dixuk1 = di2*atomJ.inducedDipole[2] - di3*atomJ.inducedDipole[1];
float dixukp1 = di2*atomJ.inducedDipoleP[2] - di3*atomJ.inducedDipoleP[1];
float ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
float dixuk2 = di3*atomJ.inducedDipole[0] - di1*atomJ.inducedDipole[2];
float dixukp2 = di3*atomJ.inducedDipoleP[0] - di1*atomJ.inducedDipoleP[2];
float ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
float dixuk3 = di1*atomJ.inducedDipole[1] - di2*atomJ.inducedDipole[0];
float dixukp3 = di1*atomJ.inducedDipoleP[1] - di2*atomJ.inducedDipoleP[0];
float ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
float gti2 = bn2*(sci4+scip4);
float gtri2 = rr5*(sci4*psc5+scip4*dsc5);
prefactor1 = 0.5f*(gti2 - gtri2);
ttm2i1 += prefactor1*( di2*zr - di3*yr );
ttm2i2 += prefactor1*( di3*xr - di1*zr );
ttm2i3 += prefactor1*( di1*yr - di2*xr );
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
prefactor1 = rr7*(sci4*psc7+scip4*dsc7) - bn3*(sci4+scip4);
ttm2i1 += prefactor1*( yr*qir3 - zr*qir2 );
ttm2i2 += prefactor1*( zr*qir1 - xr*qir3 );
ttm2i3 += prefactor1*( xr*qir2 - yr*qir1 );
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
prefactor1 = (bn2 - rr5*psc5);
prefactor2 = (bn2 - rr5*dsc5);
float ukxqir1 = atomJ.inducedDipole[1]*qir3 - atomJ.inducedDipole[2]*qir2;
float ukxqirp1 = atomJ.inducedDipoleP[1]*qir3 - atomJ.inducedDipoleP[2]*qir2;
float rxqiuk1 = yr*qiuk3 - zr*qiuk2;
float rxqiukp1 = yr*qiukp3 - zr*qiukp2;
ttm2i1 += prefactor1*(ukxqir1 + rxqiuk1) + prefactor2*(ukxqirp1 + rxqiukp1);
float ukxqir2 = atomJ.inducedDipole[2]*qir1 - atomJ.inducedDipole[0]*qir3;
float ukxqirp2 = atomJ.inducedDipoleP[2]*qir1 - atomJ.inducedDipoleP[0]*qir3;
float rxqiuk2 = zr*qiuk1 - xr*qiuk3;
float rxqiukp2 = zr*qiukp1 - xr*qiukp3;
ttm2i2 += prefactor1*(ukxqir2 + rxqiuk2) + prefactor2*(ukxqirp2 + rxqiukp2);
float ukxqir3 = atomJ.inducedDipole[0]*qir2 - atomJ.inducedDipole[1]*qir1;
float ukxqirp3 = atomJ.inducedDipoleP[0]*qir2 - atomJ.inducedDipoleP[1]*qir1;
float rxqiuk3 = xr*qiuk2 - yr*qiuk1;
float rxqiukp3 = xr*qiukp2 - yr*qiukp1;
ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3) + prefactor2*(ukxqirp3 + rxqiukp3);
forceTorqueEnergy->x += ttm2i1;
forceTorqueEnergy->y += ttm2i2;
forceTorqueEnergy->z += ttm2i3;
return;
}
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float* scalingFactors, float4 forceTorqueEnergy[3] ){
float4 delta;
delta.x = atomJ.x - atomI.x;
delta.y = atomJ.y - atomI.y;
delta.z = atomJ.z - atomI.z;
// periodic box
delta.x -= floor(delta.x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta.y -= floor(delta.y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta.z -= floor(delta.z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if( delta.w > cSim.nonbondedCutoffSqr || delta.w == 0.0f ){
forceTorqueEnergy[0].x = forceTorqueEnergy[0].y = forceTorqueEnergy[0].z = forceTorqueEnergy[0].w = 0.0f;
forceTorqueEnergy[1].x = forceTorqueEnergy[1].y = forceTorqueEnergy[1].z = 0.0f;
forceTorqueEnergy[2].x = forceTorqueEnergy[2].y = forceTorqueEnergy[2].z = 0.0f;
return;
}
delta.w = sqrt(delta.w);
float r = delta.w;
float ralpha = cSim.alphaEwald*r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 0.0f;
if( cSim.alphaEwald > 0.0f ){
alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
}
float exp2a = exp(-(ralpha*ralpha));
float rr1 = 1.0f/r;
float bn0 = erfc(ralpha)*rr1;
float rr2 = rr1*rr1;
forceTorqueEnergy[0].w = atomI.q*atomJ.q*bn0;
alsq2n *= alsq2;
float4 bn;
bn.x = (bn0+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.y = (3.0f*bn.x+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.z = (5.0f*bn.y+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
bn.w = (7.0f*bn.z+alsq2n*exp2a)*rr2;
alsq2n *= alsq2;
float bn5 = (9.0f*bn.w+alsq2n*exp2a)*rr2;
//calculatePmeDirectElectrostaticPairIxn_F1_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnF1_kernel( atomI, atomJ, delta, bn, bn5, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxn_F2_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnF2_kernel( atomI, atomJ, delta, bn, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxn_T1_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnT1_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+1) );
//calculatePmeDirectElectrostaticPairIxn_T2_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnT2_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+1) );
//calculatePmeDirectElectrostaticPairIxn_T3_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxnT3_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
// T3 == T1 w/ particles I and J reversed
// T4 == T2 w/ particles I and J reversed
delta.x *= -1.0f;
delta.y *= -1.0f;
delta.z *= -1.0f;
calculatePmeDirectElectrostaticPairIxnT1_kernel( atomJ, atomI, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
calculatePmeDirectElectrostaticPairIxnT2_kernel( atomJ, atomI, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
//calculatePmeDirectElectrostaticPairIxn_T4_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxnT4_kernel( atomI, atomJ, delta, scalingFactors, (forceTorqueEnergy+2) );
return;
}
__device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float* scalingFactors, float4 forceTorqueEnergy[3]
#ifdef AMOEBA_DEBUG
,float4* debugArray
......@@ -186,12 +1020,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi4 = atomI.labFrameQuadrupole[3];
float qi5 = atomI.labFrameQuadrupole[4];
float qi6 = atomI.labFrameQuadrupole[5];
float qi7 = atomI.labFrameQuadrupole[6];
float qi8 = atomI.labFrameQuadrupole[7];
float qi9 = atomI.labFrameQuadrupole[8];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
float qi9 = atomI.labFrameQuadrupole[5];
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
......@@ -200,12 +1031,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk4 = atomJ.labFrameQuadrupole[3];
float qk5 = atomJ.labFrameQuadrupole[4];
float qk6 = atomJ.labFrameQuadrupole[5];
float qk7 = atomJ.labFrameQuadrupole[6];
float qk8 = atomJ.labFrameQuadrupole[7];
float qk9 = atomJ.labFrameQuadrupole[8];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
float qk9 = atomJ.labFrameQuadrupole[5];
// calculate the real space error function terms
......@@ -325,25 +1153,25 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float dkxr2 = dk3*xr - dk1*zr;
float dkxr3 = dk1*yr - dk2*xr;
float qir1 = qi1*xr + qi4*yr + qi7*zr;
float qir2 = qi2*xr + qi5*yr + qi8*zr;
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk4*yr + qk7*zr;
float qkr2 = qk2*xr + qk5*yr + qk8*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float qiqkr1 = qi1*qkr1 + qi4*qkr2 + qi7*qkr3;
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi8*qkr3;
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float qkqir1 = qk1*qir1 + qk4*qir2 + qk7*qir3;
float qkqir2 = qk2*qir1 + qk5*qir2 + qk8*qir3;
float qkqir1 = qk1*qir1 + qk2*qir2 + qk3*qir3;
float qkqir2 = qk2*qir1 + qk5*qir2 + qk6*qir3;
float qkqir3 = qk3*qir1 + qk6*qir2 + qk9*qir3;
float qixqk1 = qi2*qk3 + qi5*qk6 + qi8*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk8;
float qixqk2 = qi3*qk1 + qi6*qk4 + qi9*qk7 - qi1*qk3 - qi4*qk6 - qi7*qk9;
float qixqk3 = qi1*qk2 + qi4*qk5 + qi7*qk8 - qi2*qk1 - qi5*qk4 - qi8*qk7;
float qixqk1 = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
float qixqk2 = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
float qixqk3 = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
float rxqir1 = yr*qir3 - zr*qir2;
float rxqir2 = zr*qir1 - xr*qir3;
......@@ -365,28 +1193,28 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qkrxqir2 = qkr3*qir1 - qkr1*qir3;
float qkrxqir3 = qkr1*qir2 - qkr2*qir1;
float qidk1 = qi1*dk1 + qi4*dk2 + qi7*dk3;
float qidk2 = qi2*dk1 + qi5*dk2 + qi8*dk3;
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float qkdi1 = qk1*di1 + qk4*di2 + qk7*di3;
float qkdi2 = qk2*di1 + qk5*di2 + qk8*di3;
float qkdi1 = qk1*di1 + qk2*di2 + qk3*di3;
float qkdi2 = qk2*di1 + qk5*di2 + qk6*di3;
float qkdi3 = qk3*di1 + qk6*di2 + qk9*di3;
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi4*atomJ.inducedDipole[1] + qi7*atomJ.inducedDipole[2];
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi8*atomJ.inducedDipole[2];
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qkui1 = qk1*atomI.inducedDipole[0] + qk4*atomI.inducedDipole[1] + qk7*atomI.inducedDipole[2];
float qkui2 = qk2*atomI.inducedDipole[0] + qk5*atomI.inducedDipole[1] + qk8*atomI.inducedDipole[2];
float qkui1 = qk1*atomI.inducedDipole[0] + qk2*atomI.inducedDipole[1] + qk3*atomI.inducedDipole[2];
float qkui2 = qk2*atomI.inducedDipole[0] + qk5*atomI.inducedDipole[1] + qk6*atomI.inducedDipole[2];
float qkui3 = qk3*atomI.inducedDipole[0] + qk6*atomI.inducedDipole[1] + qk9*atomI.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi4*atomJ.inducedDipoleP[1] + qi7*atomJ.inducedDipoleP[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi8*atomJ.inducedDipoleP[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
float qkuip1 = qk1*atomI.inducedDipoleP[0] + qk4*atomI.inducedDipoleP[1] + qk7*atomI.inducedDipoleP[2];
float qkuip2 = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk8*atomI.inducedDipoleP[2];
float qkuip1 = qk1*atomI.inducedDipoleP[0] + qk2*atomI.inducedDipoleP[1] + qk3*atomI.inducedDipoleP[2];
float qkuip2 = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk6*atomI.inducedDipoleP[2];
float qkuip3 = qk3*atomI.inducedDipoleP[0] + qk6*atomI.inducedDipoleP[1] + qk9*atomI.inducedDipoleP[2];
float dixqkr1 = di2*qkr3 - di3*qkr2;
......@@ -448,8 +1276,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float sc8 = qkr1*di1 + qkr2*di2 + qkr3*di3;
float sc9 = qir1*qkr1 + qir2*qkr2 + qir3*qkr3;
float sc10 = qi1*qk1 + qi2*qk2 + qi3*qk3
+ qi4*qk4 + qi5*qk5 + qi6*qk6
+ qi7*qk7 + qi8*qk8 + qi9*qk9;
+ qi2*qk2 + qi5*qk5 + qi6*qk6
+ qi3*qk3 + qi6*qk6 + qi9*qk9;
// calculate the scalar products for induced components
......@@ -1031,12 +1859,9 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
sA->labFrameQuadrupole[0] = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
sA->labFrameQuadrupole[1] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
sA->labFrameQuadrupole[2] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
sA->labFrameQuadrupole[3] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
sA->labFrameQuadrupole[4] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
sA->labFrameQuadrupole[5] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
sA->labFrameQuadrupole[6] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
sA->labFrameQuadrupole[7] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
sA->labFrameQuadrupole[8] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
sA->labFrameQuadrupole[3] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
sA->labFrameQuadrupole[4] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
sA->labFrameQuadrupole[5] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
// induced dipole
......@@ -1120,28 +1945,35 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
static const int maxL1 = 0;
if( threadsPerBlock == 0 ){
unsigned int sharedMemoryPerBlock = gpu->sharedMemoryPerBlock;
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
if (gpu->sm_version >= SM_20){
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
if( maxL1 ){
sharedMemoryPerBlock = 16384;
cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferL1 );
}
} else if (gpu->sm_version >= SM_12){
maxThreads = 128;
else
} else {
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
}
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle), sharedMemoryPerBlock), maxThreads);
}
kClearFields_3( amoebaGpu, 1 );
#ifdef AMOEBA_DEBUG
//#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u gpu->nonbond_threads_per_block=%u\n",
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u maxL1=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, maxL1 );
(void) fflush( amoebaGpu->log );
}
#endif
//#endif
if (gpu->bOutputBufferPerWarp){
......@@ -1156,12 +1988,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else {
/*
if (gpu->sm_version >= SM_20)
cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferL1 );
//cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferShared );
*/
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
#ifdef AMOEBA_DEBUG
......@@ -1187,7 +2013,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
......
......@@ -239,6 +239,9 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// No interactions in this block.
} else {
#ifdef CALCULATE_FULL_TILE
flags = 0xFFFFFFFF;
#endif
sA[threadIdx.x].force[0] = 0.0f;
sA[threadIdx.x].force[1] = 0.0f;
sA[threadIdx.x].force[2] = 0.0f;
......@@ -311,7 +314,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA[jIdx].torque[0] += forceTorqueEnergy[2].x;
psA[jIdx].torque[1] += forceTorqueEnergy[2].y;
psA[jIdx].torque[2] += forceTorqueEnergy[2].z;
#ifndef CALCULATE_FULL_TILE
} else {
sA[threadIdx.x].tempForce[0] = forceTorqueEnergy[0].x;
......@@ -345,6 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
}
#endif
}
} // end of atoms out-of-bounds
} // end of flags&(1<<j block
......
......@@ -437,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
maxThreads = 192;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
......@@ -469,7 +469,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3), gpu->sharedMemoryPerBlock),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
......
......@@ -37,10 +37,9 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
//#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
#ifdef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
......@@ -51,6 +50,93 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
#endif
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
const float uscale, float4* delta, float* preFactor2 ) {
// compute thedelta->xeal space portion of the Ewald summation
delta->x = atomJ.x - atomI.x;
delta->y = atomJ.y - atomI.y;
delta->z = atomJ.z - atomI.z;
// pdelta->xiodic boundary conditions
delta->x -= floor(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta->y -= floor(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta->z -= floor(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z);
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
if( damp > -50.0f) {
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f*(1.0f-dsc5)/r5;
delta->w = rr3 - bn1;
*preFactor2 = bn2 - rr5;
} else {
delta->w = *preFactor2 = 0.0f;
}
}
__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
float preFactor3 = preFactor2*(inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z);
fieldSum[0] += preFactor3*delta.x + delta.w*inducedDipole[0];
fieldSum[1] += preFactor3*delta.y + delta.w*inducedDipole[1];
fieldSum[2] += preFactor3*delta.z + delta.w*inducedDipole[2];
}
__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
float preFactor3 = preFactor2*(inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z);
fieldSum[0] = preFactor3*delta.x + delta.w*inducedDipole[0];
fieldSum[1] = preFactor3*delta.y + delta.w*inducedDipole[1];
fieldSum[2] = preFactor3*delta.z + delta.w*inducedDipole[2];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
......@@ -385,7 +471,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......@@ -573,17 +659,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdatePmeMutualInducedField");
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
}
// get total epsilon -- performing sums on gpu
......
......@@ -100,99 +100,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
for (unsigned int j = 0; j < GRID; j++)
{
float4 ijField[3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
#ifdef AMOEBA_DEBUG
/*
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
}
*/
#endif
float4 delta;
float prefactor2;
if( ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ){
delta.w = prefactor2 = 0.0f;
} else {
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 );
}
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
}
......@@ -226,6 +144,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// No interactions in this block.
} else {
#ifndef INCLUDE_MI_FIELD_BUFFERS
flags = 0xFFFFFFFF;
#endif
// zero shared fields
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
......@@ -235,53 +157,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if ((flags&(1<<j)) != 0)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float4 ijField[3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
float4 delta;
float prefactor2;
if( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ){
delta.w = prefactor2 = 0.0f;
} else {
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, &delta, &prefactor2 );
}
calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
#ifndef INCLUDE_MI_FIELD_BUFFERS
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
#else
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? ijField[0].y : 0.0;
sA[threadIdx.x].tempBuffer[1] = mask ? ijField[1].y : 0.0;
sA[threadIdx.x].tempBuffer[2] = mask ? ijField[2].y : 0.0;
sA[threadIdx.x].tempBufferP[0] = mask ? ijField[0].w : 0.0;
sA[threadIdx.x].tempBufferP[1] = mask ? ijField[1].w : 0.0;
sA[threadIdx.x].tempBufferP[2] = mask ? ijField[2].w : 0.0;
calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipole, delta, prefactor2, sA[threadIdx.x].tempBuffer );
calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipolePolar, delta, prefactor2, sA[threadIdx.x].tempBufferP );
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......@@ -308,61 +202,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
}
}
/*
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
}
#endif
*/
}
tj = (tj + 1) & (GRID - 1);
......
......@@ -531,7 +531,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
maxThreads = 192;
else
maxThreads = 128;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
}
if( 0 ){
......
......@@ -382,7 +382,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
maxThreads = 192;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment