Commit b20978e1 authored by Mark Friedrichs
Browse files

Fix for direct PME

parent a9054686
......@@ -47,9 +47,7 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
}
/**
 * Destructor.
 *
 * Hands the amoebaGpu context owned by this object to amoebaGpuShutDown().
 * The entry/exit debug traces that used to be written to stderr have been
 * removed: a destructor in library code should not produce console output
 * on every teardown.
 */
AmoebaCudaData::~AmoebaCudaData() {
    amoebaGpuShutDown( amoebaGpu );
}
void AmoebaCudaData::decrementKernelCount( void ) {
......
......@@ -167,9 +167,10 @@ public:
*/
void setApplyCutoff( int applyCutoff );
CudaPlatform::PlatformData& cudaPlatformData;
private:
CudaPlatform::PlatformData& cudaPlatformData;
amoebaGpuContext amoebaGpu;
bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
int multipoleForceCount;
......
......@@ -670,11 +670,11 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
amoebaGpuContext gpu = data.getAmoebaGpu();
if( data.getMultipoleForceCount() == 0 ){
gpuCopyInteractingWorkUnit( gpu );
}
if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
gpuReorderAtoms(gpu->gpuContext);
gpuCopyWorkUnit( gpu );
}
//if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
//gpuReorderAtoms(gpu->gpuContext);
//}
data.incrementMultipoleForceCount();
data.initializeGpu();
......@@ -875,10 +875,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
}
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
data.setApplyCutoff( 1 );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
gpuContext gpu = amoebaGpu->gpuContext;
gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
gpu->sim.nonbondedMethod = PARTICLE_MESH_EWALD;
data.cudaPlatformData.nonbondedMethod = PARTICLE_MESH_EWALD;
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
gpuContext gpu = amoebaGpu->gpuContext;
gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
gpu->sim.nonbondedMethod = PARTICLE_MESH_EWALD;
}
data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
}
......
......@@ -4305,7 +4305,7 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
--------------------------------------------------------------------------------------- */
void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){
// ---------------------------------------------------------------------------------------
......@@ -4315,7 +4315,7 @@ void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
//gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
gpu->psWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
}
gpu->psInteractingWorkUnit->Upload();
......
......@@ -344,7 +344,7 @@ extern "C"
void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
extern "C"
void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
void gpuCopyWorkUnit(amoebaGpuContext gpu);
/*
extern "C"
......
......@@ -255,6 +255,14 @@ if( atomI == targetAtom ){
} else {
if (lasty != y) {
// load shared data
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
}
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
......@@ -346,40 +354,40 @@ if( atomI == targetAtom ){
} else {
psA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
psA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
psA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
sA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
psA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
psA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
psA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].force[0] -= psA[threadIdx.x].tempForce[0] + psA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= psA[threadIdx.x].tempForce[1] + psA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= psA[threadIdx.x].tempForce[2] + psA[threadIdx.x+16].tempForce[2];
psA[jIdx].force[0] -= sA[threadIdx.x].tempForce[0] + sA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= sA[threadIdx.x].tempForce[1] + sA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= sA[threadIdx.x].tempForce[2] + sA[threadIdx.x+16].tempForce[2];
psA[jIdx].torque[0] += psA[threadIdx.x].tempTorque[0] + psA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += psA[threadIdx.x].tempTorque[1] + psA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += psA[threadIdx.x].tempTorque[2] + psA[threadIdx.x+16].tempTorque[2];
psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
}
}
tj = (tj + 1) & (GRID - 1);
tj = (tj + 1) & (GRID - 1);
} // end of j-loop
......
......@@ -429,7 +429,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
// print intermediate results for the targetAtom
unsigned int targetAtom = 354;
unsigned int targetAtom = 1280;
#endif
kClearFields_3( amoebaGpu, 2 );
......@@ -458,8 +458,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif
} else {
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
......@@ -553,24 +551,14 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
amoebaGpu->gpuContext->psPosq4->Download();
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
if( fabs(debugArray->_pSysStream[0][jj+3*paddedNumberOfAtoms].x) > 0.0 ){
(void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
for( int kk = 0; kk < 6; kk++ ){
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
(void) fprintf( amoebaGpu->log,"\n" );
}
......@@ -591,7 +579,7 @@ if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
}
#endif
if( 1 ){
if( 0 ){
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
......
......@@ -140,18 +140,30 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
fieldPolarSum[2] += match ? 0.0f : ijField[2][2];
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom ){
if( atomI == targetAtom || targetAtom == (y+j) ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
unsigned int indices[4] = { indexI, indexJ, indexI+1, indexJ+1 };
float flag = 7.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
float flag = 7.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
for( int ii = 0; ii < 4; ii++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indices[ii]][0];
......@@ -159,6 +171,7 @@ if( atomI == targetAtom ){
debugArray[index].z = match ? 0.0f : ijField[indices[ii]][2];
debugArray[index].w = flag;
}
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
......@@ -186,18 +199,18 @@ if( atomI == targetAtom ){
} else {
if (lasty != y ) {
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
}
unsigned int flags = cSim.pInteractionFlag[pos];
// flags = 0xFFFFFFFF;
if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y ) {
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
}
// zero shared fields
......@@ -254,36 +267,36 @@ if( atomI == targetAtom ){
} else {
psA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[1][2];
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[1][0];
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1][1];
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[1][2];
psA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
......@@ -300,6 +313,18 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
debugArray[index].w = pScaleValue;
float flag = 9.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = jIdx;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI][1];
......
......@@ -759,13 +759,22 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
}
#endif
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysStream[0][1],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
fflush( amoebaGpu->log );
if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon )exit(0);
iteration++;
}
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
if( 1 ){
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
......
......@@ -211,20 +211,20 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
} else {
if (lasty != y)
{
unsigned int atomJ = y + tgx;
// load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
}
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y)
{
unsigned int atomJ = y + tgx;
// load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
}
// zero shared fields
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
......@@ -273,36 +273,36 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
} else {
psA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[1][2];
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[1][0];
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1][1];
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[1][2];
psA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].field[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
......
......@@ -396,16 +396,20 @@ if( 0 ){
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
//unsigned int y = ((x >> 2) & 0x7fff);
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "Cell %8u %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
x = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " %8u [%5u %5u %1u] %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
// x = (x >> 17);
(void) fprintf( amoebaGpu->log, "GpuCell %8u %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
x = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
}
} else {
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment