Continue PME

42d5b80a · Mark Friedrichs · 770f7abc · 42d5b80a · 42d5b80a · 42d5b80a
Commit 42d5b80a authored Sep 14, 2010 by Mark Friedrichs
5 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -1271,6 +1271,6 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
 {
 
    cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
-    kCalculateAmoebaPME( amoebaGpu );
+    //kCalculateAmoebaPME( amoebaGpu );
 } 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -46,6 +46,7 @@ void kReduceDirectSelfFields_kernel( unsigned int fieldComponents, unsigned int
    // Reduce field

    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
+    //const float term = 0.0f;
    while (pos < fieldComponents)
    {   

@@ -73,6 +74,7 @@ void kReduceDirectSelfFields_kernel( unsigned int fieldComponents, unsigned int
        {   
            totalField += pFt[0];
        }   
+
        fieldOut[pos]   = totalField;
        pos            += gridDim.x * blockDim.x;
    }   
@@ -256,18 +258,19 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    if( r2 <= cAmoebaSim.cutoffDistance2 ){

        fields[0][0]       = fim[0] - fid[0];
-        fields[1][0]       = fkm[0] - fkd[0];
-        fields[2][0]       = fim[0] - fip[0];
-        fields[3][0]       = fkm[0] - fkp[0];
-    
        fields[0][1]       = fim[1] - fid[1];
-        fields[1][1]       = fkm[1] - fkd[1];
-        fields[2][1]       = fim[1] - fip[1];
-        fields[3][1]       = fkm[1] - fkp[1];
-    
        fields[0][2]       = fim[2] - fid[2];
+
+        fields[1][0]       = fkm[0] - fkd[0];
+        fields[1][1]       = fkm[1] - fkd[1];
        fields[1][2]       = fkm[2] - fkd[2];
+
+        fields[2][0]       = fim[0] - fip[0];
+        fields[2][1]       = fim[1] - fip[1];
        fields[2][2]       = fim[2] - fip[2];
+
+        fields[3][0]       = fkm[0] - fkp[0];
+        fields[3][1]       = fkm[1] - fkp[1];
        fields[3][2]       = fkm[2] - fkp[2];
 
    } else {
@@ -297,11 +300,12 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
    pullBack[1].y = atomJ.y - atomI.y;
    pullBack[1].z = atomJ.z - atomI.z;
    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
-/*
-    pullBack[1].x = scale3;
-    pullBack[1].y = scale5;
-    pullBack[1].z = scale7;
-*/
+
+    pullBack[2].x = scale3;
+    pullBack[2].y = scale5;
+    pullBack[2].z = scale7;
+    pullBack[2].w = -1.0;
+
 #endif
 }

@@ -351,10 +355,17 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    unsigned int targetAtom  = 0;
 #endif

+//#define SET_ALPHA_EWALD
+//#ifdef SET_ALPHA_EWALD
+//    amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
+//    (void) fprintf( amoebaGpu->log, "computeCudaAmoebaPmeFixedEField: forceing alphaEwald=%15.7e\n", amoebaGpu->gpuContext->sim.alphaEwald );
+//    SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpu);
+//#endif
+
+
    kClearFields_3( amoebaGpu, 2 );

    if (gpu->bOutputBufferPerWarp){
-        (void) fprintf( amoebaGpu->log, "N2 warp\n" );
        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
@@ -366,14 +377,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 #endif
    } else {

-#ifdef AMOEBA_DEBUG
-        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
-        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
-                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-#endif
-
        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
@@ -384,7 +387,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
 #endif
    }
-
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");

 #if 0
@@ -406,16 +408,44 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
-        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
                        amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, 
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
        (void) fflush( amoebaGpu->log );
        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
-        (void) fprintf( amoebaGpu->log, "OutEFields\n" );
+        (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2]  paddedNumberOfAtoms=%d\n",  amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers );
        int maxPrint        = 32;
+        for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
+           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
+
+            int indexOffset     = ii*3;
+
+           // buffer 1
+
+           (void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ",
+                           amoebaGpu->psWorkArray_3_1->_pSysStream[0][indexOffset],
+                           amoebaGpu->psWorkArray_3_1->_pSysStream[0][indexOffset+1],
+                           amoebaGpu->psWorkArray_3_1->_pSysStream[0][indexOffset+2] );
+   
+           // buffer 2
+
+           (void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ",
+                           amoebaGpu->psWorkArray_3_2->_pSysStream[0][indexOffset],
+                           amoebaGpu->psWorkArray_3_2->_pSysStream[0][indexOffset+1],
+                           amoebaGpu->psWorkArray_3_2->_pSysStream[0][indexOffset+2] );
+
+           (void) fprintf( amoebaGpu->log,"\n" );
+           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
+                ii = gpu->natoms - maxPrint;
+           }
+        }
+        (void) fflush( amoebaGpu->log );
+        (void) fprintf( amoebaGpu->log, "EFields End\n" );
+
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

@@ -450,7 +480,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
            (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
-            for( int kk = 0; kk < 7; kk++ ){
+            for( int kk = 0; kk < 10; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
@@ -476,7 +506,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    }
 #endif

-        if( 1 ){
+        if( 0 ){
            std::vector<int> fileId;
            fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
@@ -491,5 +521,5 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
 void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
 {
    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
-    kCalculateAmoebaPMEFixedMultipoleField( amoebaGpu );
+    //kCalculateAmoebaPMEFixedMultipoleField( amoebaGpu );
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
@@ -44,6 +44,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
 ){

 #ifdef AMOEBA_DEBUG
+    int pullIndexMax = 12;
    float4 pullBack[12];
    float dScaleVal;
    float pScaleVal;
@@ -120,6 +121,7 @@ pScaleVal = 1.0f;
                    );

                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
+match = 1;

                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole

@@ -130,6 +132,7 @@ pScaleVal = 1.0f;
                    fieldPolarSum[0]       += match ? 0.0f : ijField[2][0];
                    fieldPolarSum[1]       += match ? 0.0f : ijField[2][1];
                    fieldPolarSum[2]       += match ? 0.0f : ijField[2][2];
+
                }

            }
@@ -166,7 +169,7 @@ pScaleVal = pScaleValue;
                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
                    // by setting match flag

-                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
+                    unsigned int match      = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;

                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole

@@ -180,54 +183,40 @@ pScaleVal = pScaleValue;

 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
-            unsigned int pullBackIndex         = 0;
-            unsigned int indexI                = 0;
-            unsigned int indexJ                = indexI ? 0 : 2;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = dScaleValue;
-            debugArray[index].w                = pScaleValue;
-
-
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;
-
-            pullBackIndex++;
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            debugArray[index].w                = pullBack[pullBackIndex].w;
-
+        unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
+        unsigned int indexI                = 0;
+        unsigned int indexJ                = indexI ? 0 : 2;
+        unsigned int indices[4]            = { indexI, indexJ, indexI+1, indexJ+1 };
+
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) (y + j);
+        debugArray[index].z                = dScaleValue;
+        debugArray[index].w                = pScaleValue;
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        unsigned int off                   = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+        debugArray[index].x                = (float) x;
+        debugArray[index].y                = (float) tgx;
+        debugArray[index].z                = -2;
+        debugArray[index].w                = (float) off;
+
+        float flag                         = 7.0f;
+        for( int ii = 0; ii < 4; ii++ ){
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            float flag                         = 7.0f;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].x                = match ? 0.0f : ijField[indices[ii]][0];
+            debugArray[index].y                = match ? 0.0f : ijField[indices[ii]][1];
+            debugArray[index].z                = match ? 0.0f : ijField[indices[ii]][2];
            debugArray[index].w                = flag;
+        }

+        for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-            debugArray[index].w                = flag;
+            debugArray[index].x                = pullBack[pullIndex].x;
+            debugArray[index].y                = pullBack[pullIndex].y;
+            debugArray[index].z                = pullBack[pullIndex].z;
+            debugArray[index].w                = pullBack[pullIndex].w;
+        }   

-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI+1][0];
-            debugArray[index].y                = ijField[indexI+1][1];
-            debugArray[index].z                = ijField[indexI+1][2];
-            debugArray[index].w                = flag;
-
-            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ+1][0];
-            debugArray[index].y                = ijField[indexJ+1][1];
-            debugArray[index].z                = ijField[indexJ+1][2];
-            debugArray[index].w                = flag;

 /*
            index                             += cAmoebaSim.paddedNumberOfAtoms;
@@ -300,25 +289,27 @@ pScaleVal = 1.0f;
 #endif
                    );
    
+                    unsigned int outOfBounds     = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+
                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole

-                    fieldSum[0]        += ijField[0][0];
-                    fieldSum[1]        += ijField[0][1];
-                    fieldSum[2]        += ijField[0][2];
+                    fieldSum[0]                 += outOfBounds ? 0.0f : ijField[0][0];
+                    fieldSum[1]                 += outOfBounds ? 0.0f : ijField[0][1];
+                    fieldSum[2]                 += outOfBounds ? 0.0f : ijField[0][2];
        
-                    fieldPolarSum[0]   += ijField[2][0];
-                    fieldPolarSum[1]   += ijField[2][1];
-                    fieldPolarSum[2]   += ijField[2][2];
+                    fieldPolarSum[0]            += outOfBounds ? 0.0f : ijField[2][0];
+                    fieldPolarSum[1]            += outOfBounds ? 0.0f : ijField[2][1];
+                    fieldPolarSum[2]            += outOfBounds ? 0.0f : ijField[2][2];

                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole

-                    psA[tj].eField[0]  += ijField[1][0];
-                    psA[tj].eField[1]  += ijField[1][1];
-                    psA[tj].eField[2]  += ijField[1][2];
+                    psA[tj].eField[0]           += outOfBounds ? 0.0f : ijField[1][0];
+                    psA[tj].eField[1]           += outOfBounds ? 0.0f : ijField[1][1];
+                    psA[tj].eField[2]           += outOfBounds ? 0.0f : ijField[1][2];

-                    psA[tj].eFieldP[0] += ijField[3][0];
-                    psA[tj].eFieldP[1] += ijField[3][1];
-                    psA[tj].eFieldP[2] += ijField[3][2];
+                    psA[tj].eFieldP[0]          += outOfBounds ? 0.0f : ijField[3][0];
+                    psA[tj].eFieldP[1]          += outOfBounds ? 0.0f : ijField[3][1];
+                    psA[tj].eFieldP[2]          += outOfBounds ? 0.0f : ijField[3][2];


 #ifdef AMOEBA_DEBUG
@@ -422,25 +413,27 @@ pScaleVal = pScaleValue;
 #endif
                    );

+                    unsigned int outOfBounds     = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
+
                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole

-                    fieldSum[0]        += ijField[0][0];
-                    fieldSum[1]        += ijField[0][1];
-                    fieldSum[2]        += ijField[0][2];
+                    fieldSum[0]                 += outOfBounds ? 0.0f : ijField[0][0];
+                    fieldSum[1]                 += outOfBounds ? 0.0f : ijField[0][1];
+                    fieldSum[2]                 += outOfBounds ? 0.0f : ijField[0][2];

-                    fieldPolarSum[0]   += ijField[2][0];
-                    fieldPolarSum[1]   += ijField[2][1];
-                    fieldPolarSum[2]   += ijField[2][2];
+                    fieldPolarSum[0]            += outOfBounds ? 0.0f : ijField[2][0];
+                    fieldPolarSum[1]            += outOfBounds ? 0.0f : ijField[2][1];
+                    fieldPolarSum[2]            += outOfBounds ? 0.0f : ijField[2][2];

                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole

-                    psA[tj].eField[0]  += ijField[1][0];
-                    psA[tj].eField[1]  += ijField[1][1];
-                    psA[tj].eField[2]  += ijField[1][2];
+                    psA[tj].eField[0]           += outOfBounds ? 0.0f : ijField[1][0];
+                    psA[tj].eField[1]           += outOfBounds ? 0.0f : ijField[1][1];
+                    psA[tj].eField[2]           += outOfBounds ? 0.0f : ijField[1][2];

-                    psA[tj].eFieldP[0] += ijField[3][0];
-                    psA[tj].eFieldP[1] += ijField[3][1];
-                    psA[tj].eFieldP[2] += ijField[3][2];
+                    psA[tj].eFieldP[0]          += outOfBounds ? 0.0f : ijField[3][0];
+                    psA[tj].eFieldP[1]          += outOfBounds ? 0.0f : ijField[3][1];
+                    psA[tj].eFieldP[2]          += outOfBounds ? 0.0f : ijField[3][2];


 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -429,7 +429,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte

 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log && iteration == 1 ){
-        (void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
+        (void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d -- Direct only -- self added in kSorUpdateMutualInducedField_kernel\n",
+                        iteration ); (void) fflush( amoebaGpu->log );
        outputArray->Download();
        outputPolarArray->Download();
        debugArray->Download();
@@ -556,6 +557,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        amoebaGpu->psE_Field->_pSysStream[0][ii]       *= conversion;
        amoebaGpu->psE_FieldPolar->_pSysStream[0][ii]  *= conversion;
    }
+    amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
+    (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldBySOR: forcing alphaEwald=%15.7e\n", amoebaGpu->gpuContext->sim.alphaEwald );
+    SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpu);
    amoebaGpu->psE_Field->Upload();
    amoebaGpu->psE_FieldPolar->Upload();
 #endif
@@ -584,7 +588,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        amoebaGpu->psPolarizability->Download();
        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
        int offset   = 0;
-        int maxPrint = 20000;
+        int maxPrint = 10;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii, 
                            amoebaGpu->psPolarizability->_pSysStream[0][offset] );
@@ -616,7 +620,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
        // matrix multiply

        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
-        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
+        //kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
        LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");  



--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaPME.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaPME.cpp
@@ -63,7 +63,9 @@ void testPMEWater() {
    quadrupole[0] = 3.540307211e-2;
    quadrupole[4] = -3.902570771e-2;
    quadrupole[8] = 3.622635596e-3;
-    mp->addParticle(-0.51966, dipole, quadrupole, 1, 1, 2, 0.39, 9.707801995e-1, 0.837);
+    double damp = 9.707801995e-01*sqrt(0.1);
+    double polarity = 0.837*0.001;
+    mp->addParticle(-0.51966, dipole, quadrupole, 1, 1, 2, 0.39, damp, polarity);
    dipole[0] = -2.042094848e-2;
    dipole[2] = -3.078753000e-2;
    quadrupole[0] = -3.428482490e-3;
@@ -71,10 +73,30 @@ void testPMEWater() {
    quadrupole[4] = -1.002408752e-2;
    quadrupole[6] = -1.894859639e-4;
    quadrupole[8] = 1.345257001e-2;
-    mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 2, 0.39, 8.897068742e-1, 0.496);
-    mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 1, 0.39, 8.897068742e-1, 0.496);
+    damp          = 8.897068742e-01*sqrt(0.1);
+    polarity      = 0.496*0.001;
+    mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 2, 0.39, damp, polarity);
+    mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 1, 0.39, damp, polarity);
    mp->setCutoffDistance(1.0);
-//    nonbonded->setEwaldErrorTolerance(TOL);
+
+    std::vector<int> intVector;
+    intVector.push_back( 0 );
+    intVector.push_back( 1 );
+    intVector.push_back( 2 );
+    mp->setCovalentMap( 0, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
+    mp->setCovalentMap( 1, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
+    mp->setCovalentMap( 2, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
+
+    intVector.resize(0); intVector.push_back( 1 ); intVector.push_back( 2 );
+    mp->setCovalentMap( 0, AmoebaMultipoleForce::Covalent12, intVector );
+
+    intVector.resize(0); intVector.push_back( 0 ); intVector.push_back( 2 );
+    mp->setCovalentMap( 1, AmoebaMultipoleForce::Covalent12, intVector );
+
+    intVector.resize(0); intVector.push_back( 0 ); intVector.push_back( 1 );
+    mp->setCovalentMap( 2, AmoebaMultipoleForce::Covalent12, intVector );
+
+    mp->setEwaldErrorTolerance(TOL);
    system.setDefaultPeriodicBoxVectors(Vec3(2, 0, 0), Vec3(0, 2, 0), Vec3(0, 0, 2));
    system.addForce(mp);
    Context context(system, integrator, platform);
@@ -87,6 +109,13 @@ void testPMEWater() {
    context.setPositions(positions);
    State state = context.getState(State::Forces | State::Energy);
    const vector<Vec3>& forces = state.getForces();
+    (void) fprintf( stderr, "PME forces\n" );
+        for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+            (void) fprintf( stderr, "%6u [%14.7e %14.7e %14.7e]\n", ii, 
+                            forces[ii][0], forces[ii][1], forces[ii][2] );
+    }
+    (void) fflush( stderr );
+
 }

 int main() {