Fix for direct PME

b20978e1 · Mark Friedrichs · a9054686 · b20978e1 · b20978e1 · b20978e1
Commit b20978e1 authored Oct 06, 2010 by Mark Friedrichs
11 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -47,9 +47,7 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
 }   

 AmoebaCudaData::~AmoebaCudaData() {
-    (void) fprintf( stderr, "In AmoebaCudaData destructor\n" ); fflush( stderr );
    amoebaGpuShutDown( amoebaGpu );
-    (void) fprintf( stderr, "Done AmoebaCudaData destructor\n" ); fflush( stderr );
 }

 void AmoebaCudaData::decrementKernelCount( void ) {

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -167,9 +167,10 @@ public:
     */
    void setApplyCutoff( int applyCutoff ); 

+    CudaPlatform::PlatformData& cudaPlatformData;
+
 private:

-    CudaPlatform::PlatformData& cudaPlatformData;
    amoebaGpuContext amoebaGpu;
    bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
    int multipoleForceCount;

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -670,11 +670,11 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {

    amoebaGpuContext gpu = data.getAmoebaGpu();
    if( data.getMultipoleForceCount() == 0 ){
-        gpuCopyInteractingWorkUnit( gpu );
-    }
-    if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
-        gpuReorderAtoms(gpu->gpuContext);
+        gpuCopyWorkUnit( gpu );
    }
+    //if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
+        //gpuReorderAtoms(gpu->gpuContext);
+    //}
    data.incrementMultipoleForceCount();
    data.initializeGpu();

@@ -875,10 +875,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        }
        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
        data.setApplyCutoff( 1 );
-        amoebaGpuContext amoebaGpu  = data.getAmoebaGpu();
-        gpuContext gpu              = amoebaGpu->gpuContext;
-        gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
-        gpu->sim.nonbondedMethod    = PARTICLE_MESH_EWALD;
+        data.cudaPlatformData.nonbondedMethod = PARTICLE_MESH_EWALD;
+        amoebaGpuContext amoebaGpu            = data.getAmoebaGpu();
+        gpuContext gpu                        = amoebaGpu->gpuContext;
+        gpu->sim.nonbondedCutoffSqr           = force.getCutoffDistance()*force.getCutoffDistance();
+        gpu->sim.nonbondedMethod              = PARTICLE_MESH_EWALD;
    }
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -4305,7 +4305,7 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){

   --------------------------------------------------------------------------------------- */

-void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
+void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){

 // ---------------------------------------------------------------------------------------

@@ -4315,7 +4315,7 @@ void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
    amoebaGpu->psWorkUnit->Download();
    (void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
-        gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+        //gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
        gpu->psWorkUnit->_pSysStream[0][ii]            = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
    }    
    gpu->psInteractingWorkUnit->Upload();

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -344,7 +344,7 @@ extern "C"
 void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);

 extern "C"
-void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
+void gpuCopyWorkUnit(amoebaGpuContext gpu);

 /*
 extern "C"

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -255,6 +255,14 @@ if( atomI == targetAtom ){

        } else {

+            if (lasty != y) {
+
+                // load shared data
+
+               loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
+
+            }
+
            unsigned int flags           = cSim.pInteractionFlag[pos];
            if (flags == 0) {
                // No interactions in this block.
@@ -346,40 +354,40 @@ if( atomI == targetAtom ){

                    } else {

-                        psA[threadIdx.x].tempForce[0]     = mask ? 0.0f : force[0];
-                        psA[threadIdx.x].tempForce[1]     = mask ? 0.0f : force[1];
-                        psA[threadIdx.x].tempForce[2]     = mask ? 0.0f : force[2];
+                        sA[threadIdx.x].tempForce[0]     = mask ? 0.0f : force[0];
+                        sA[threadIdx.x].tempForce[1]     = mask ? 0.0f : force[1];
+                        sA[threadIdx.x].tempForce[2]     = mask ? 0.0f : force[2];
   
-                        psA[threadIdx.x].tempTorque[0]    = mask ? 0.0f : torque[1][0];
-                        psA[threadIdx.x].tempTorque[1]    = mask ? 0.0f : torque[1][1];
-                        psA[threadIdx.x].tempTorque[2]    = mask ? 0.0f : torque[1][2];
+                        sA[threadIdx.x].tempTorque[0]    = mask ? 0.0f : torque[1][0];
+                        sA[threadIdx.x].tempTorque[1]    = mask ? 0.0f : torque[1][1];
+                        sA[threadIdx.x].tempTorque[2]    = mask ? 0.0f : torque[1][2];

                        if( tgx % 2 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );  
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );  
                        }
                        if( tgx % 4 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );  
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );  
                        }
                        if( tgx % 8 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );  
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );  
                        }
                        if( tgx % 16 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );  
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );  
                        }

                        if (tgx == 0)
                        {
-                            psA[jIdx].force[0]  -= psA[threadIdx.x].tempForce[0]  + psA[threadIdx.x+16].tempForce[0];
-                            psA[jIdx].force[1]  -= psA[threadIdx.x].tempForce[1]  + psA[threadIdx.x+16].tempForce[1];
-                            psA[jIdx].force[2]  -= psA[threadIdx.x].tempForce[2]  + psA[threadIdx.x+16].tempForce[2];
+                            psA[jIdx].force[0]  -= sA[threadIdx.x].tempForce[0]  + sA[threadIdx.x+16].tempForce[0];
+                            psA[jIdx].force[1]  -= sA[threadIdx.x].tempForce[1]  + sA[threadIdx.x+16].tempForce[1];
+                            psA[jIdx].force[2]  -= sA[threadIdx.x].tempForce[2]  + sA[threadIdx.x+16].tempForce[2];

-                            psA[jIdx].torque[0] += psA[threadIdx.x].tempTorque[0] + psA[threadIdx.x+16].tempTorque[0];
-                            psA[jIdx].torque[1] += psA[threadIdx.x].tempTorque[1] + psA[threadIdx.x+16].tempTorque[1];
-                            psA[jIdx].torque[2] += psA[threadIdx.x].tempTorque[2] + psA[threadIdx.x+16].tempTorque[2];
+                            psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
+                            psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
+                            psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
                        }
                    }
 
-                    tj                              = (tj + 1) & (GRID - 1);
+                    tj = (tj + 1) & (GRID - 1);

                } // end of j-loop


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -429,7 +429,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )

    // print intermediate results for the targetAtom 

-    unsigned int targetAtom  = 354;
+    unsigned int targetAtom  = 1280;
 #endif

    kClearFields_3( amoebaGpu, 2 );
@@ -458,8 +458,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
 #endif
    } else {
-
-                                                                           //amoebaGpu->psWorkUnit->_pDevStream[0],
        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
@@ -553,24 +551,14 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
        amoebaGpu->gpuContext->psPosq4->Download();
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
-if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
+if( fabs(debugArray->_pSysStream[0][jj+3*paddedNumberOfAtoms].x) > 0.0 ){
            (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
-            for( int kk = 0; kk < 6; kk++ ){
+            for( int kk = 0; kk < 7; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
                debugIndex += paddedNumberOfAtoms;
            }
-            (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
-                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
-                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
-                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
-                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
-                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
-                            amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
-                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
-                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
-                           (amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
            (void) fprintf( amoebaGpu->log,"\n" );
 }

@@ -591,7 +579,7 @@ if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
    }
 #endif

-        if( 1 ){
+        if( 0 ){
            std::vector<int> fileId;
            fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
@@ -140,18 +140,30 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
                fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];

 #ifdef AMOEBA_DEBUG
-if( atomI == targetAtom ){
+if( atomI == targetAtom || targetAtom == (y+j) ){
    unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
    unsigned int indexI                = 0;
    unsigned int indexJ                = indexI ? 0 : 2;
    unsigned int indices[4]            = { indexI, indexJ, indexI+1, indexJ+1 };
+    float flag                         = 7.0f;

    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) (y + j);
    debugArray[index].z                = dScaleValue;
    debugArray[index].w                = pScaleValue;

-    float flag                         = 7.0f;
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = (float) bExclusionFlag;
+    debugArray[index].y                = (float) (tgx);
+    debugArray[index].z                = (float) j;
+    debugArray[index].w                = flag;
+
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = (float) dScaleMask;
+    debugArray[index].y                = (float) pScaleMask.x;
+    debugArray[index].z                = (float) pScaleMask.y;
+    debugArray[index].w                = flag;
+
    for( int ii = 0; ii < 4; ii++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = match ? 0.0f : ijField[indices[ii]][0];
@@ -159,6 +171,7 @@ if( atomI == targetAtom ){
        debugArray[index].z                = match ? 0.0f : ijField[indices[ii]][2];
        debugArray[index].w                = flag;
    }
+
    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[pullIndex].x;
@@ -186,18 +199,18 @@ if( atomI == targetAtom ){

        } else {

+            if (lasty != y ) {
+    
+                // load coordinates, charge, ...
+    
+                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
+    
+            }
+
            unsigned int flags = cSim.pInteractionFlag[pos];
-//  flags = 0xFFFFFFFF;
            if (flags == 0) {
                // No interactions in this block.
            } else {
-                if (lasty != y ) {
-    
-                    // load coordinates, charge, ...
-    
-                    loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
-    
-                }

                // zero shared fields

@@ -254,36 +267,36 @@ if( atomI == targetAtom ){
 
                    } else {

-                         psA[threadIdx.x].tempBuffer[0]  = outOfBounds ? 0.0f : ijField[1][0];
-                         psA[threadIdx.x].tempBuffer[1]  = outOfBounds ? 0.0f : ijField[1][1];
-                         psA[threadIdx.x].tempBuffer[2]  = outOfBounds ? 0.0f : ijField[1][2];
+                        sA[threadIdx.x].tempBuffer[0]  = outOfBounds ? 0.0f : ijField[1][0];
+                        sA[threadIdx.x].tempBuffer[1]  = outOfBounds ? 0.0f : ijField[1][1];
+                        sA[threadIdx.x].tempBuffer[2]  = outOfBounds ? 0.0f : ijField[1][2];
    
-                         psA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
-                         psA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
-                         psA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
+                        sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
+                        sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
+                        sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];

                        if( tgx % 2 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] ); 
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] ); 
                        } 
                        if( tgx % 4 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] ); 
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] ); 
                        } 
                        if( tgx % 8 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] ); 
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] ); 
                        } 
                        if( tgx % 16 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] ); 
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] ); 
                        } 

                        if (tgx == 0)
                        {
-                            psA[jIdx].eField[0]  += psA[threadIdx.x].tempBuffer[0]  + psA[threadIdx.x+16].tempBuffer[0];
-                            psA[jIdx].eField[1]  += psA[threadIdx.x].tempBuffer[1]  + psA[threadIdx.x+16].tempBuffer[1];
-                            psA[jIdx].eField[2]  += psA[threadIdx.x].tempBuffer[2]  + psA[threadIdx.x+16].tempBuffer[2];
+                            psA[jIdx].eField[0]  += sA[threadIdx.x].tempBuffer[0]  + sA[threadIdx.x+16].tempBuffer[0];
+                            psA[jIdx].eField[1]  += sA[threadIdx.x].tempBuffer[1]  + sA[threadIdx.x+16].tempBuffer[1];
+                            psA[jIdx].eField[2]  += sA[threadIdx.x].tempBuffer[2]  + sA[threadIdx.x+16].tempBuffer[2];

-                            psA[jIdx].eFieldP[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
-                            psA[jIdx].eFieldP[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
-                            psA[jIdx].eFieldP[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
+                            psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
+                            psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
+                            psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
                        }
                    }

@@ -300,6 +313,18 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
            debugArray[index].w                = pScaleValue;

            float flag                         = 9.0f;
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = (float) bExclusionFlag;
+            debugArray[index].y                = (float) (tgx);
+            debugArray[index].z                = (float) j;
+            debugArray[index].w                = jIdx;
+        
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = (float) dScaleMask;
+            debugArray[index].y                = (float) pScaleMask.x;
+            debugArray[index].z                = (float) pScaleMask.y;
+            debugArray[index].w                = (float) flags;
+        
            index                             += cAmoebaSim.paddedNumberOfAtoms;
            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexI][0];
            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexI][1];

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -759,13 +759,22 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba

        }
 #endif
+
+(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
+                iteration, amoebaGpu->mutualInducedCurrentEpsilon,
+                amoebaGpu->psCurrentEpsilon->_pSysStream[0][1], 
+                amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
+fflush( amoebaGpu->log );
+if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon )exit(0);
+
+
        iteration++;
    }

    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;

-    if( 1 ){
+    if( 0 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -211,20 +211,20 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

        } else {

+            if (lasty != y)
+            {
+                unsigned int atomJ        = y + tgx;
+
+                // load coordinates, charge, ...
+
+                loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
+            }
+    
            unsigned int flags = cSim.pInteractionFlag[pos];
            if (flags == 0) {
                // No interactions in this block.
            } else {

-                if (lasty != y)
-                {
-                    unsigned int atomJ        = y + tgx;
-    
-                    // load coordinates, charge, ...
-    
-                    loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
-                }
-    
               // zero shared fields
    
                zeroMutualInducedParticleSharedField(  &(sA[threadIdx.x]) );
@@ -273,36 +273,36 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

                    } else {

-                        psA[threadIdx.x].tempBuffer[0]  = mask ? 0.0f : ijField[1][0];
-                        psA[threadIdx.x].tempBuffer[1]  = mask ? 0.0f : ijField[1][1];
-                        psA[threadIdx.x].tempBuffer[2]  = mask ? 0.0f : ijField[1][2];
+                        sA[threadIdx.x].tempBuffer[0]  = mask ? 0.0f : ijField[1][0];
+                        sA[threadIdx.x].tempBuffer[1]  = mask ? 0.0f : ijField[1][1];
+                        sA[threadIdx.x].tempBuffer[2]  = mask ? 0.0f : ijField[1][2];

-                        psA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
-                        psA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
-                        psA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
+                        sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
+                        sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
+                        sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];

                        if( tgx % 2 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
                        }
                        if( tgx % 4 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
                        }
                        if( tgx % 8 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
                        }
                        if( tgx % 16 == 0 ){
-                            sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
+                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
                        }

                        if (tgx == 0)
                        {
-                            psA[jIdx].field[0]         += psA[threadIdx.x].tempBuffer[0]  + psA[threadIdx.x+16].tempBuffer[0];
-                            psA[jIdx].field[1]         += psA[threadIdx.x].tempBuffer[1]  + psA[threadIdx.x+16].tempBuffer[1];
-                            psA[jIdx].field[2]         += psA[threadIdx.x].tempBuffer[2]  + psA[threadIdx.x+16].tempBuffer[2];
+                            psA[jIdx].field[0]         += sA[threadIdx.x].tempBuffer[0]  + sA[threadIdx.x+16].tempBuffer[0];
+                            psA[jIdx].field[1]         += sA[threadIdx.x].tempBuffer[1]  + sA[threadIdx.x+16].tempBuffer[1];
+                            psA[jIdx].field[2]         += sA[threadIdx.x].tempBuffer[2]  + sA[threadIdx.x+16].tempBuffer[2];

-                            psA[jIdx].fieldPolar[0]    += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
-                            psA[jIdx].fieldPolar[1]    += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
-                            psA[jIdx].fieldPolar[2]    += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
+                            psA[jIdx].fieldPolar[0]    += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
+                            psA[jIdx].fieldPolar[1]    += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
+                            psA[jIdx].fieldPolar[2]    += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
                        }

                    }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -396,16 +396,20 @@ if( 0 ){
    amoebaGpu->psWorkUnit->Download();
    (void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
+
        unsigned int x          = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
        unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+        //unsigned int y          = ((x >> 2) & 0x7fff);
        unsigned int exclusions = (x & 0x1);
                     x          = (x >> 17) << GRIDBITS;
-        (void) fprintf( amoebaGpu->log, "Cell %8u  %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
-        x          = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
-        y          = ((x >> 2) & 0x7fff) << GRIDBITS;
-        exclusions = (x & 0x1);
-        x          = (x >> 17) << GRIDBITS;
-        (void) fprintf( amoebaGpu->log, "   %8u [%5u %5u %1u]   %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
+        //             x          = (x >> 17);
+        (void) fprintf( amoebaGpu->log, "GpuCell %8u  %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
+
+                     x          = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
+                     y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+                     exclusions = (x & 0x1);
+                     x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
    }    
 } else {
 }