Commit 35c974f6 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME

parent 92a338cf
...@@ -1018,18 +1018,7 @@ void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu) ...@@ -1018,18 +1018,7 @@ void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu)
*/ */
void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu) void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
{ {
// Perform PME for the induced dipoles.
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kGridSpreadInducedDipoles_kernel<<<10*gpu->sim.blocks, 64>>>();
LAUNCHERROR("kGridSpreadInducedDipoles");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kAmoebaReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kAmoebaReciprocalConvolution");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
int potentialThreads = (gpu->sm_version >= SM_20 ? 256 : (gpu->sm_version >= SM_12 ? 128 : 64));
kComputeInducedPotentialFromGrid_kernel<<<gpu->sim.blocks, potentialThreads>>>();
LAUNCHERROR("kComputeInducedPotentialFromGrid");
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy"); LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4); cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
......
...@@ -298,92 +298,94 @@ if( atomI == targetAtom ){ ...@@ -298,92 +298,94 @@ if( atomI == targetAtom ){
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
if ((flags&(1<<j)) != 0)
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
float force[3];
float torque[2][3];
// set scale factors
if( bExclusionFlag )
{ {
getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex ); unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex ); unsigned int atomJ = y + jIdx;
getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
} float force[3];
float torque[2][3];
// force
// set scale factors
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx], if( bExclusionFlag )
scalingFactors, force, torque, &energy {
getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
}
// force
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
); );
// check if atoms out-of-bounds
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I // check if atoms out-of-bounds
if( flags == 0xFFFFFFFF ){ unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
psA[jIdx].force[0] -= mask ? force[0] : 0.0f; // add force and torque to atom I due atom J
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
} else {
sA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0) localParticle.force[0] += mask ? force[0] : 0.0f;
{ localParticle.force[1] += mask ? force[1] : 0.0f;
psA[jIdx].force[0] -= sA[threadIdx.x].tempForce[0] + sA[threadIdx.x+16].tempForce[0]; localParticle.force[2] += mask ? force[2] : 0.0f;
psA[jIdx].force[1] -= sA[threadIdx.x].tempForce[1] + sA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= sA[threadIdx.x].tempForce[2] + sA[threadIdx.x+16].tempForce[2]; localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I
if( flags == 0xFFFFFFFF ){
psA[jIdx].force[0] -= mask ? force[0] : 0.0f;
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
} else {
sA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].force[0] -= sA[threadIdx.x].tempForce[0] + sA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= sA[threadIdx.x].tempForce[1] + sA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= sA[threadIdx.x].tempForce[2] + sA[threadIdx.x+16].tempForce[2];
psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0]; psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1]; psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2]; psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
}
} }
} }
......
...@@ -228,77 +228,78 @@ if( atomI == targetAtom || targetAtom == (y+j) ){ ...@@ -228,77 +228,78 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
for (unsigned int j = 0; j < GRID; j++){ for (unsigned int j = 0; j < GRID; j++){
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j; if ((flags&(1<<j)) != 0) {
if( bExclusionFlag ){ unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue ); if( bExclusionFlag ){
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue ); getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
} getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
if( flags == 0xFFFFFFFF ){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y; #ifdef AMOEBA_DEBUG
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y; , pullBack
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y; #endif
);
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w; unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
} else {
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y; fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y; fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w; fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w; fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
if( flags == 0xFFFFFFFF ){
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] ); // add to field at atomJ the field due atomI's charge/dipole/quadrupole
}
if( tgx % 4 == 0 ){ psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y;
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] ); psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y;
} psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y;
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] ); psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
} psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w;
if( tgx % 16 == 0 ){ psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
} } else {
if (tgx == 0) sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y;
{ sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y;
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0]; sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2]; sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0]; sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2]; if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
} }
}
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
...@@ -358,6 +359,7 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){ ...@@ -358,6 +359,7 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
} }
} }
#endif #endif
}
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} // j-loop block } // j-loop block
......
...@@ -231,81 +231,82 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -231,81 +231,82 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
if ((flags&(1<<j)) != 0)
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j; {
float4 ijField[3]; unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float4 ijField[3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0) // load coords, charge, ...
{
psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0]; calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1]; #ifdef AMOEBA_DEBUG
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2]; , pullBack
} #endif
);
} unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
...@@ -359,6 +360,7 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){ ...@@ -359,6 +360,7 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray[index].w = flag; debugArray[index].w = flag;
} }
#endif #endif
}
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment