Commit 92a338cf authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME direct space computation

parent b20978e1
......@@ -59,7 +59,8 @@ __device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsi
{
// coordinates & charge
sA->x = cSim.pPosq[atomI].x;
float4 posq = cSim.pPosq[atomI];
sA->x = posq.x;
sA->y = cSim.pPosq[atomI].y;
sA->z = cSim.pPosq[atomI].z;
sA->q = cSim.pPosq[atomI].w;
......@@ -79,8 +80,9 @@ __device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsi
sA->labFrameQuadrupole_YZ = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
sA->labFrameQuadrupole_ZZ = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
sA->damp = cAmoebaSim.pDampingFactorAndThole[atomI].x;
sA->thole = cAmoebaSim.pDampingFactorAndThole[atomI].y;
float2 dampingFactorAndThole = cAmoebaSim.pDampingFactorAndThole[atomI];
sA->damp = dampingFactorAndThole.x;
sA->thole = dampingFactorAndThole.y;
#ifdef GK
sA->bornR = bornR[atomI];
#endif
......
......@@ -23,6 +23,8 @@ struct MutualInducedParticle {
float fieldS[3];
float fieldPolarS[3];
#else
float padding;
#endif
#ifdef INCLUDE_MI_FIELD_BUFFERS
......@@ -35,10 +37,11 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
{
// coordinates & charge
sA->x = cSim.pPosq[atomI].x;
sA->y = cSim.pPosq[atomI].y;
sA->z = cSim.pPosq[atomI].z;
sA->q = cSim.pPosq[atomI].w;
float4 posq = cSim.pPosq[atomI];
sA->x = posq.x;
sA->y = posq.y;
sA->z = posq.z;
sA->q = posq.w;
// dipole
......@@ -52,8 +55,9 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
sA->inducedDipolePolar[1] = cAmoebaSim.pInducedDipolePolar[atomI*3+1];
sA->inducedDipolePolar[2] = cAmoebaSim.pInducedDipolePolar[atomI*3+2];
sA->damp = cAmoebaSim.pDampingFactorAndThole[atomI].x;
sA->thole = cAmoebaSim.pDampingFactorAndThole[atomI].y;
float2 dampingFactorAndThole = cAmoebaSim.pDampingFactorAndThole[atomI];
sA->damp = dampingFactorAndThole.x;
sA->thole = dampingFactorAndThole.y;
#ifdef GK
......
......@@ -29,7 +29,7 @@
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 130)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(128, 1)
#else
__launch_bounds__(64, 1)
......
......@@ -167,7 +167,7 @@ __device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& at
}
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float fields[4][3]
float dscale, float pscale, float4 fields[3]
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
......@@ -192,20 +192,19 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float bn[4];
bn[0] = erfc(ralpha)/r;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
bn[1] = (bn[0]+alsq2n*exp2a)/r2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[2] = (3.0f*bn[1]+alsq2n*exp2a)/r2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[3] = (5.0f*bn[2]+alsq2n*exp2a)/r2;
float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
......@@ -262,99 +261,96 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
float qkr = qkx*xr + qky*yr + qkz*zr;
float fim[3],fkm[3];
float fid[3],fkd[3];
float fip[3],fkp[3];
fim[0] = -xr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr)
- bn[1]*atomJ.labFrameDipole_X + 2.0f*bn[2]*qkx;
float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx;
fim[1] = -yr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr)
- bn[1]*atomJ.labFrameDipole_Y + 2.0f*bn[2]*qky;
float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky;
fim[2] = -zr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr)
- bn[1]*atomJ.labFrameDipole_Z + 2.0f*bn[2]*qkz;
float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)
- bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz;
fkm[0] = xr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir)
- bn[1]*atomI.labFrameDipole_X - 2.0f*bn[2]*qix;
float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix;
fkm[1] = yr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir)
- bn[1]*atomI.labFrameDipole_Y - 2.0f*bn[2]*qiy;
float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy;
fkm[2] = zr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir)
- bn[1]*atomI.labFrameDipole_Z - 2.0f*bn[2]*qiz;
float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir)
- bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz;
fid[0] = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
fid[1] = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
fid[2] = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr)
- drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
fkd[0] = xr*(drr3*atomI.q+drr5*dir+drr7*qir)
float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
fkd[1] = yr*(drr3*atomI.q+drr5*dir+drr7*qir)
float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
fkd[2] = zr*(drr3*atomI.q+drr5*dir+drr7*qir)
float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir)
- drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
fip[0] = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
fip[1] = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
fip[2] = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr)
- prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
fkp[0] = xr*(prr3*atomI.q+prr5*dir+prr7*qir)
float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
fkp[1] = yr*(prr3*atomI.q+prr5*dir+prr7*qir)
float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
fkp[2] = zr*(prr3*atomI.q+prr5*dir+prr7*qir)
float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir)
- prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
// increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fim[0] - fid[0];
fields[0][1] = fim[1] - fid[1];
fields[0][2] = fim[2] - fid[2];
fields[0].x = fim0 - fid0;
fields[1].x = fim1 - fid1;
fields[2].x = fim2 - fid2;
fields[1][0] = fkm[0] - fkd[0];
fields[1][1] = fkm[1] - fkd[1];
fields[1][2] = fkm[2] - fkd[2];
fields[0].y = fkm0 - fkd0;
fields[1].y = fkm1 - fkd1;
fields[2].y = fkm2 - fkd2;
fields[2][0] = fim[0] - fip[0];
fields[2][1] = fim[1] - fip[1];
fields[2][2] = fim[2] - fip[2];
fields[0].z = fim0 - fip0;
fields[1].z = fim1 - fip1;
fields[2].z = fim2 - fip2;
fields[3][0] = fkm[0] - fkp[0];
fields[3][1] = fkm[1] - fkp[1];
fields[3][2] = fkm[2] - fkp[2];
fields[0].w = fkm0 - fkp0;
fields[1].w = fkm1 - fkp1;
fields[2].w = fkm2 - fkp2;
} else {
fields[0][0] = 0.0f;
fields[1][0] = 0.0f;
fields[2][0] = 0.0f;
fields[3][0] = 0.0f;
fields[0].x = 0.0f;
fields[0].y = 0.0f;
fields[0].z = 0.0f;
fields[0].w = 0.0f;
fields[0][1] = 0.0f;
fields[1][1] = 0.0f;
fields[2][1] = 0.0f;
fields[3][1] = 0.0f;
fields[1].x = 0.0f;
fields[1].y = 0.0f;
fields[1].z = 0.0f;
fields[1].w = 0.0f;
fields[0][2] = 0.0f;
fields[1][2] = 0.0f;
fields[2][2] = 0.0f;
fields[3][2] = 0.0f;
fields[2].x = 0.0f;
fields[2].y = 0.0f;
fields[2].z = 0.0f;
fields[2].w = 0.0f;
}
#ifdef AMOEBA_DEBUG
......@@ -441,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
maxThreads = 192;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
......
......@@ -28,11 +28,11 @@
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
__launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(192, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int* workUnit,
......@@ -117,7 +117,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
}
float ijField[4][3];
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
......@@ -131,13 +131,13 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += match ? 0.0f : ijField[0][0];
fieldSum[1] += match ? 0.0f : ijField[0][1];
fieldSum[2] += match ? 0.0f : ijField[0][2];
fieldSum[0] += match ? 0.0f : ijField[0].x;
fieldSum[1] += match ? 0.0f : ijField[1].x;
fieldSum[2] += match ? 0.0f : ijField[2].x;
fieldPolarSum[0] += match ? 0.0f : ijField[2][0];
fieldPolarSum[1] += match ? 0.0f : ijField[2][1];
fieldPolarSum[2] += match ? 0.0f : ijField[2][2];
fieldPolarSum[0] += match ? 0.0f : ijField[0].z;
fieldPolarSum[1] += match ? 0.0f : ijField[1].z;
fieldPolarSum[2] += match ? 0.0f : ijField[2].z;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || targetAtom == (y+j) ){
......@@ -234,7 +234,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
float ijField[4][3];
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
......@@ -245,35 +245,35 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0][0];
fieldSum[1] += outOfBounds ? 0.0f : ijField[0][1];
fieldSum[2] += outOfBounds ? 0.0f : ijField[0][2];
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[2][0];
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[2][1];
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2][2];
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
if( flags == 0xFFFFFFFF ){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[1][0];
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1][1];
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[1][2];
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y;
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y;
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y;
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0];
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1];
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2];
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w;
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
} else {
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[1][0];
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1][1];
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[1][2];
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
......@@ -55,7 +55,7 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float uscale, float fields[4][3]
float uscale, float4 fields[3]
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
......@@ -80,17 +80,16 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float bn[3];
bn[0] = erfc(ralpha)/r;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
bn[1] = (bn[0]+alsq2n*exp2a)/r2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[2] = (3.0f*bn[1]+alsq2n*exp2a)/r2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
......@@ -124,81 +123,76 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
float fimd[3],fkmd[3];
float fimp[3],fkmp[3];
float fid[3],fkd[3];
float fip[3],fkp[3];
bn1 *= -1.0f;
bn[1] *= -1.0f;
float fimd0 = bn1*atomJ.inducedDipole[0] + bn2*dukr*xr;
float fimd1 = bn1*atomJ.inducedDipole[1] + bn2*dukr*yr;
float fimd2 = bn1*atomJ.inducedDipole[2] + bn2*dukr*zr;
fimd[0] = bn[1]*atomJ.inducedDipole[0] + bn[2]*dukr*xr;
fimd[1] = bn[1]*atomJ.inducedDipole[1] + bn[2]*dukr*yr;
fimd[2] = bn[1]*atomJ.inducedDipole[2] + bn[2]*dukr*zr;
float fkmd0 = bn1*atomI.inducedDipole[0] + bn2*duir*xr;
float fkmd1 = bn1*atomI.inducedDipole[1] + bn2*duir*yr;
float fkmd2 = bn1*atomI.inducedDipole[2] + bn2*duir*zr;
fkmd[0] = bn[1]*atomI.inducedDipole[0] + bn[2]*duir*xr;
fkmd[1] = bn[1]*atomI.inducedDipole[1] + bn[2]*duir*yr;
fkmd[2] = bn[1]*atomI.inducedDipole[2] + bn[2]*duir*zr;
float fimp0 = bn1*atomJ.inducedDipolePolar[0] + bn2*pukr*xr;
float fimp1 = bn1*atomJ.inducedDipolePolar[1] + bn2*pukr*yr;
float fimp2 = bn1*atomJ.inducedDipolePolar[2] + bn2*pukr*zr;
fimp[0] = bn[1]*atomJ.inducedDipolePolar[0] + bn[2]*pukr*xr;
fimp[1] = bn[1]*atomJ.inducedDipolePolar[1] + bn[2]*pukr*yr;
fimp[2] = bn[1]*atomJ.inducedDipolePolar[2] + bn[2]*pukr*zr;
fkmp[0] = bn[1]*atomI.inducedDipolePolar[0] + bn[2]*puir*xr;
fkmp[1] = bn[1]*atomI.inducedDipolePolar[1] + bn[2]*puir*yr;
fkmp[2] = bn[1]*atomI.inducedDipolePolar[2] + bn[2]*puir*zr;
float fkmp0 = bn1*atomI.inducedDipolePolar[0] + bn2*puir*xr;
float fkmp1 = bn1*atomI.inducedDipolePolar[1] + bn2*puir*yr;
float fkmp2 = bn1*atomI.inducedDipolePolar[2] + bn2*puir*zr;
rr3 *= -1.0f;;
fid[0] = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
fid[1] = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
fid[2] = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
float fid0 = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
float fid1 = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
float fid2 = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
fkd[0] = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
fkd[1] = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
fkd[2] = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
float fkd0 = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
float fkd1 = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
float fkd2 = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
fip[0] = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
fip[1] = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
fip[2] = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
float fip0 = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
float fip1 = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
float fip2 = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
fkp[0] = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
fkp[1] = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
fkp[2] = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
float fkp0 = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
float fkp1 = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
float fkp2 = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
// increment the field at each site due to this interaction
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fimd[0] - fid[0];
fields[1][0] = fkmd[0] - fkd[0];
fields[2][0] = fimp[0] - fip[0];
fields[3][0] = fkmp[0] - fkp[0];
fields[0].x = fimd0 - fid0;
fields[0].y = fkmd0 - fkd0;
fields[0].z = fimp0 - fip0;
fields[0].w = fkmp0 - fkp0;
fields[0][1] = fimd[1] - fid[1];
fields[1][1] = fkmd[1] - fkd[1];
fields[2][1] = fimp[1] - fip[1];
fields[3][1] = fkmp[1] - fkp[1];
fields[1].x = fimd1 - fid1;
fields[1].y = fkmd1 - fkd1;
fields[1].z = fimp1 - fip1;
fields[1].w = fkmp1 - fkp1;
fields[0][2] = fimd[2] - fid[2];
fields[1][2] = fkmd[2] - fkd[2];
fields[2][2] = fimp[2] - fip[2];
fields[3][2] = fkmp[2] - fkp[2];
fields[2].x = fimd2 - fid2;
fields[2].y = fkmd2 - fkd2;
fields[2].z = fimp2 - fip2;
fields[2].w = fkmp2 - fkp2;
} else {
fields[0][0] = 0.0f;
fields[1][0] = 0.0f;
fields[2][0] = 0.0f;
fields[3][0] = 0.0f;
fields[0].x = 0.0f;
fields[0].y = 0.0f;
fields[0].z = 0.0f;
fields[0].w = 0.0f;
fields[0][1] = 0.0f;
fields[1][1] = 0.0f;
fields[2][1] = 0.0f;
fields[3][1] = 0.0f;
fields[1].x = 0.0f;
fields[1].y = 0.0f;
fields[1].z = 0.0f;
fields[1].w = 0.0f;
fields[0][2] = 0.0f;
fields[1][2] = 0.0f;
fields[2][2] = 0.0f;
fields[3][2] = 0.0f;
fields[2].x = 0.0f;
fields[2].y = 0.0f;
fields[2].z = 0.0f;
fields[2].w = 0.0f;
}
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
......@@ -207,8 +201,8 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
pullBack[0].w = r2;
pullBack[1].x = alsq2;
pullBack[1].y = bn[0];
pullBack[1].z = bn[2];
pullBack[1].y = bn0;
pullBack[1].z = bn2;
pullBack[1].w = exp2a;
/*
......
......@@ -100,7 +100,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3];
float4 ijField[3];
// load coords, charge, ...
......@@ -114,13 +114,13 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0][0] : 0.0f;
fieldSum[1] += mask ? ijField[0][1] : 0.0f;
fieldSum[2] += mask ? ijField[0][2] : 0.0f;
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
fieldPolarSum[0] += mask ? ijField[2][0] : 0.0f;
fieldPolarSum[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[2][2] : 0.0f;
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){
......@@ -233,7 +233,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float ijField[4][3];
float4 ijField[3];
// load coords, charge, ...
......@@ -247,39 +247,39 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0][0] : 0.0f;
fieldSum[1] += mask ? ijField[0][1] : 0.0f;
fieldSum[2] += mask ? ijField[0][2] : 0.0f;
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[2][0] : 0.0f;
fieldPolarSum[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[2][2] : 0.0f;
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[1][0] : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1][1] : 0.0f;
psA[jIdx].field[2] += mask ? ijField[1][2] : 0.0f;
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[1][0];
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1][1];
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[1][2];
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment