Commit 35c974f6 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME

parent 92a338cf
......@@ -1018,18 +1018,7 @@ void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu)
*/
void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
{
// Perform PME for the induced dipoles.
gpuContext gpu = amoebaGpu->gpuContext;
kGridSpreadInducedDipoles_kernel<<<10*gpu->sim.blocks, 64>>>();
LAUNCHERROR("kGridSpreadInducedDipoles");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kAmoebaReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kAmoebaReciprocalConvolution");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
int potentialThreads = (gpu->sm_version >= SM_20 ? 256 : (gpu->sm_version >= SM_12 ? 128 : 64));
kComputeInducedPotentialFromGrid_kernel<<<gpu->sim.blocks, potentialThreads>>>();
LAUNCHERROR("kComputeInducedPotentialFromGrid");
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
......
......@@ -298,92 +298,94 @@ if( atomI == targetAtom ){
for (unsigned int j = 0; j < GRID; j++)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
float force[3];
float torque[2][3];
// set scale factors
if( bExclusionFlag )
if ((flags&(1<<j)) != 0)
{
getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
}
// force
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
scalingFactors, force, torque, &energy
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
float force[3];
float torque[2][3];
// set scale factors
if( bExclusionFlag )
{
getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
}
// force
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// check if atoms out-of-bounds
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
);
// add force and torque to atom J due atom I
// check if atoms out-of-bounds
if( flags == 0xFFFFFFFF ){
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
psA[jIdx].force[0] -= mask ? force[0] : 0.0f;
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
} else {
sA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
// add force and torque to atom I due atom J
if (tgx == 0)
{
psA[jIdx].force[0] -= sA[threadIdx.x].tempForce[0] + sA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= sA[threadIdx.x].tempForce[1] + sA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= sA[threadIdx.x].tempForce[2] + sA[threadIdx.x+16].tempForce[2];
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I
if( flags == 0xFFFFFFFF ){
psA[jIdx].force[0] -= mask ? force[0] : 0.0f;
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
} else {
sA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].force[0] -= sA[threadIdx.x].tempForce[0] + sA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= sA[threadIdx.x].tempForce[1] + sA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= sA[threadIdx.x].tempForce[2] + sA[threadIdx.x+16].tempForce[2];
psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
}
}
}
......
......@@ -228,77 +228,78 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
for (unsigned int j = 0; j < GRID; j++){
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
if( bExclusionFlag ){
getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
if( flags == 0xFFFFFFFF ){
if ((flags&(1<<j)) != 0) {
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
if( bExclusionFlag ){
getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y;
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y;
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y;
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w;
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
} else {
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
if( flags == 0xFFFFFFFF ){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y;
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y;
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y;
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w;
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
} else {
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
}
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
......@@ -358,6 +359,7 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
}
}
#endif
}
tj = (tj + 1) & (GRID - 1);
} // j-loop block
......
......@@ -231,81 +231,82 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
for (unsigned int j = 0; j < GRID; j++)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float4 ijField[3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if ((flags&(1<<j)) != 0)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float4 ijField[3];
if (tgx == 0)
{
psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
// load coords, charge, ...
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
}
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
......@@ -359,6 +360,7 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray[index].w = flag;
}
#endif
}
tj = (tj + 1) & (GRID - 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment