PME fixes

Converted CUDAStream SysStream[0] and DevStream[0] references to SysData & Devdata

PME fixes
Converted CUDAStream SysStream[0] and DevStream[0] references to SysData & Devdata
0fe5c905 · Mark Friedrichs · ffcf55d9 · 0fe5c905 · 0fe5c905 · 0fe5c905
Commit 0fe5c905 authored Feb 11, 2011 by Mark Friedrichs
20 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -990,9 +990,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            if( pmeParametersSetBasedOnEwaldErrorTolerance  ){
                 (void) fprintf( data.getLog(), " parameters set based on error tolerance and OpenMM algorithm.\n" );
            } else {
+                 double alphaT;
+                 int xsizeT, ysizeT, zsizeT;
+                 NonbondedForceImpl::calcPMEParameters(system, nb, alphaT, xsizeT, ysizeT, zsizeT);
                 double impliedTolerance  = alpha*force.getCutoffDistance();
                        impliedTolerance  = 0.5*exp( -(impliedTolerance*impliedTolerance) );
-                 (void) fprintf( data.getLog(), " using input parameters   implied tolerance=%12.3e\n", impliedTolerance );
+                 (void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e;", impliedTolerance );
+                 (void) fprintf( data.getLog(), "OpenMM param: aEwald=%12.3f [%6d %6d %6d]\n", alphaT, xsizeT, ysizeT, zsizeT);
            }
            (void) fflush( data.getLog() );
        }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -689,12 +689,12 @@ static void printElectrostaticBuffer( amoebaGpuContext amoebaGpu, unsigned int b
        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
    } 

 /*
@@ -702,14 +702,14 @@ static void printElectrostaticBuffer( amoebaGpuContext amoebaGpu, unsigned int b
    stop  = -146016;
    float maxV = -1.0e+99;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        if(  amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){ 
+        if(  amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){ 
            unsigned int ii3Index      = ii/3;
            unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
            unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
            (void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n", 
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
-            maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
+            maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
        } 
    } 
 */
@@ -722,12 +722,12 @@ static void printElectrostaticAtomBuffers( amoebaGpuContext amoebaGpu, unsigned
        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
    } 
 }
 #endif
@@ -771,7 +771,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
    }   
   int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 0;
 #endif
@@ -871,34 +871,34 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
           // force

           (void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                           amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psForce->_pSysData[indexOffset],
+                           amoebaGpu->psForce->_pSysData[indexOffset+1],
+                           amoebaGpu->psForce->_pSysData[indexOffset+2] );
    
           // torque

           (void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psTorque->_pSysData[indexOffset],
+                           amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                           amoebaGpu->psTorque->_pSysData[indexOffset+2] );

           // coords

 #if 0
            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
-                            gpu->psPosq4->_pSysStream[0][ii].x,
-                            gpu->psPosq4->_pSysStream[0][ii].y,
-                            gpu->psPosq4->_pSysStream[0][ii].z);
+                            gpu->psPosq4->_pSysData[ii].x,
+                            gpu->psPosq4->_pSysData[ii].y,
+                            gpu->psPosq4->_pSysData[ii].z);


           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
               int debugIndex = jj*gpu->natoms + ii;
-               float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
-               float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
-               float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
+               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
+               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
+               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
                               ii, jj, xx, yy, zz,
-                               debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );

           }
 #endif
@@ -914,8 +914,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
                int debugIndex = jj;
                for( int kk = 0; kk < 5; kk++ ){
                    (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
                    debugIndex += paddedNumberOfAtoms;
                }
                (void) fprintf( amoebaGpu->log,"\n" );
@@ -933,12 +933,12 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
                int offset  = 3*ii;
    
                (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
-                                amoebaGpu->psForce->_pSysStream[0][offset],
-                                amoebaGpu->psForce->_pSysStream[0][offset+1],
-                                amoebaGpu->psForce->_pSysStream[0][offset+2],
-                                amoebaGpu->psTorque->_pSysStream[0][offset],
-                                amoebaGpu->psTorque->_pSysStream[0][offset+1],
-                                amoebaGpu->psTorque->_pSysStream[0][offset+2] );
+                                amoebaGpu->psForce->_pSysData[offset],
+                                amoebaGpu->psForce->_pSysData[offset+1],
+                                amoebaGpu->psForce->_pSysData[offset+2],
+                                amoebaGpu->psTorque->_pSysData[offset],
+                                amoebaGpu->psTorque->_pSysData[offset+1],
+                                amoebaGpu->psTorque->_pSysData[offset+2] );
                if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
            }   
        }   

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
@@ -40,17 +40,17 @@ static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psE_Field->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );

    LAUNCHERROR("kReduceEAndGK_Fields1");
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
    LAUNCHERROR("kReduceEAndGK_Fields2");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_3->_pDevStream[0], amoebaGpu->psGk_Field->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
    LAUNCHERROR("kReduceEAndGK_Fields3");
 }

@@ -323,12 +323,12 @@ static void printEFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIn
        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
    } 
 }

@@ -339,12 +339,12 @@ static void printEFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int tar
        unsigned int particleIndex = targetAtom + ii*3*amoebaGpu->paddedNumberOfAtoms;
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
    } 
 }
 #endif
@@ -383,7 +383,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
    // N2 debug array

    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();

    (*gpu->psInteractionCount)[0]              = gpu->sim.workUnits;
@@ -413,18 +413,18 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
    if (gpu->bOutputBufferPerWarp){
        (void) fprintf( amoebaGpu->log, "N2 warp\n" );
        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                           gpu->psBornRadii->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           gpu->psPosq4->_pDevData,
+                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
+                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
+                                                                           gpu->psBornRadii->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_3->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           amoebaGpu->psWorkArray_3_3->_pDevData,
+                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_3->_pDevStream[0] );
+                                                                           amoebaGpu->psWorkArray_3_3->_pDevData );
 #endif
    } else {

@@ -437,18 +437,18 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
 #endif

        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-                                                          amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                          gpu->psPosq4->_pDevStream[0],
-                                                          amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                          amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                          gpu->psBornRadii->_pDevStream[0],
-                                                          amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                          amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                          amoebaGpu->psWorkUnit->_pDevData,
+                                                          gpu->psPosq4->_pDevData,
+                                                          amoebaGpu->psLabFrameDipole->_pDevData,
+                                                          amoebaGpu->psLabFrameQuadrupole->_pDevData,
+                                                          gpu->psBornRadii->_pDevData,
+                                                          amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                          amoebaGpu->psWorkArray_3_2->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                          amoebaGpu->psWorkArray_3_3->_pDevStream[0],
-                                                          debugArray->_pDevStream[0], targetAtom );
+                                                          amoebaGpu->psWorkArray_3_3->_pDevData,
+                                                          debugArray->_pDevData, targetAtom );
 #else
-                                                          amoebaGpu->psWorkArray_3_3->_pDevStream[0] );
+                                                          amoebaGpu->psWorkArray_3_3->_pDevData );
 #endif
    }
    LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
@@ -459,9 +459,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
            float index = (float) ii;
            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk]   = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
+                amoebaGpu->psWorkArray_3_1->_pSysData[kk]   = index;
+                amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
+                amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
            }
        }
        amoebaGpu->psWorkArray_3_1->Upload();
@@ -497,23 +497,23 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
           // E_Field

           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
-                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psE_Field->_pSysData[indexOffset],
+                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
+                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
   
           // E_Field polar

           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset],
-                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
+                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
+                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );

           // Gk_Field polar

           (void) fprintf( amoebaGpu->log,"Gk[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psGk_Field->_pSysStream[0][indexOffset],
-                           amoebaGpu->psGk_Field->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psGk_Field->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psGk_Field->_pSysData[indexOffset],
+                           amoebaGpu->psGk_Field->_pSysData[indexOffset+1],
+                           amoebaGpu->psGk_Field->_pSysData[indexOffset+2] );

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
@@ -542,14 +542,14 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
                int debugIndex = jj; 
                (void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
                                   ii, jj, 
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w,
-                                   gpu->psBornRadii->_pSysStream[0][jj] );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w,
+                                   gpu->psBornRadii->_pSysData[jj] );

                for( int kk = 0; kk < 2; kk++ ){
                    debugIndex += paddedNumberOfAtoms;
                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
                }
                (void) fprintf( amoebaGpu->log,"\n" );
            }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
@@ -38,12 +38,12 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
 {
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psE_Field->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
    LAUNCHERROR("kReduceE_Fields1");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
    LAUNCHERROR("kReduceE_Fields2");
 }

@@ -88,7 +88,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
    // N2 debug array

    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();

    (*gpu->psInteractionCount)[0]              = gpu->sim.workUnits;
@@ -104,13 +104,13 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
    if (gpu->bOutputBufferPerWarp){
        (void) fprintf( amoebaGpu->log, "N2 warp\n" );
        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif
    } else {

@@ -123,13 +123,13 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
 #endif

        kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif
    }

@@ -141,9 +141,9 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
            float index = (float) ii;
            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk]   = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
-                amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
+                amoebaGpu->psWorkArray_3_1->_pSysData[kk]   = index;
+                amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
+                amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
            }
        }
        amoebaGpu->psWorkArray_3_1->Upload();
@@ -172,16 +172,16 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
           // E_Field

           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
-                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psE_Field->_pSysData[indexOffset],
+                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
+                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
   
           // E_Field polar

           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset],
-                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
+                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
+                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
@@ -211,65 +211,65 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
                   if( jj == ii )continue;
                   (void) fprintf( amoebaGpu->log,"\n\n%4d %4d rrs\n[%16.9e %16.9e %16.9e %16.9e]\n",
                                   ii, jj,
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
    
                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );
    
                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );
    
    
                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );
    
    
                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );

                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"Y1 %5d %16.9e %16.9e %16.9e\n", jj,
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );

-                   sum[0][0] += debugArray->_pSysStream[0][debugIndex].x;
-                   sum[0][1] += debugArray->_pSysStream[0][debugIndex].y;
-                   sum[0][2] += debugArray->_pSysStream[0][debugIndex].z;
+                   sum[0][0] += debugArray->_pSysData[debugIndex].x;
+                   sum[0][1] += debugArray->_pSysData[debugIndex].y;
+                   sum[0][2] += debugArray->_pSysData[debugIndex].z;
    
                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"Y2 %5d %16.9e %16.9e %16.9e\n", jj,
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );

-                   sum[1][0] += debugArray->_pSysStream[0][debugIndex].x;
-                   sum[1][1] += debugArray->_pSysStream[0][debugIndex].y;
-                   sum[1][2] += debugArray->_pSysStream[0][debugIndex].z;
+                   sum[1][0] += debugArray->_pSysData[debugIndex].x;
+                   sum[1][1] += debugArray->_pSysData[debugIndex].y;
+                   sum[1][2] += debugArray->_pSysData[debugIndex].z;

 /*
                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );

                   debugIndex += amoebaGpu->paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z );

    
                   debugIndex += gpu->natoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
 */   
               }
               (void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
@@ -278,12 +278,12 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
           }
        for( unsigned int ii = 0; ii < debugArray->_stride; ii++ ){
           int print;
-           if( debugArray->_pSysStream[0][ii].x  != 0.0f || debugArray->_pSysStream[0][ii].y != 0.0f ||
-               debugArray->_pSysStream[0][ii].y  != 0.0f || debugArray->_pSysStream[0][ii].w != 0.0f ||
-               debugArray->_pSysStream[0][ii].x  != debugArray->_pSysStream[0][ii].x               ||
-               debugArray->_pSysStream[0][ii].y  != debugArray->_pSysStream[0][ii].y               ||
-               debugArray->_pSysStream[0][ii].z  != debugArray->_pSysStream[0][ii].z               ||
-               debugArray->_pSysStream[0][ii].w  != debugArray->_pSysStream[0][ii].w               ){
+           if( debugArray->_pSysData[ii].x  != 0.0f || debugArray->_pSysData[ii].y != 0.0f ||
+               debugArray->_pSysData[ii].y  != 0.0f || debugArray->_pSysData[ii].w != 0.0f ||
+               debugArray->_pSysData[ii].x  != debugArray->_pSysData[ii].x               ||
+               debugArray->_pSysData[ii].y  != debugArray->_pSysData[ii].y               ||
+               debugArray->_pSysData[ii].z  != debugArray->_pSysData[ii].z               ||
+               debugArray->_pSysData[ii].w  != debugArray->_pSysData[ii].w               ){
               print = 0;
           } else {
               print = 0;
@@ -293,10 +293,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
                unsigned int atomJ = ii - atomI*amoebaGpu->paddedNumberOfAtoms;
                (void) fprintf( amoebaGpu->log, "%5u [%5u %5u]  ", ii, atomI, atomJ);
                (void) fprintf( amoebaGpu->log, "%14.6e %14.6e %14.6e %14.6e\n",
-                                debugArray->_pSysStream[0][ii].x,
-                                debugArray->_pSysStream[0][ii].y,
-                                debugArray->_pSysStream[0][ii].z,
-                                debugArray->_pSysStream[0][ii].w );
+                                debugArray->_pSysData[ii].x,
+                                debugArray->_pSysData[ii].y,
+                                debugArray->_pSysData[ii].z,
+                                debugArray->_pSysData[ii].w );
            }
        }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1542,12 +1542,12 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psKirkwoodForce->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodForce->_pDevData );
    LAUNCHERROR("kReduceForceTorque1");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
    LAUNCHERROR("kReduceForceTorque2");
 }

@@ -1561,13 +1561,13 @@ static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
 /*
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_1_1->_pDevStream[0], amoebaGpu->psBorn->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData );
    LAUNCHERROR("kReduce_dBorn1");
 */

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_1_2->_pDevStream[0], amoebaGpu->psBornPolar->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData );

    LAUNCHERROR("kReduce_dBorn2");
 }
@@ -1709,9 +1709,9 @@ static void kReduceAndCombine_dBorn(amoebaGpuContext amoebaGpu )

    kReduceAndCombineFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                                     amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
-                                     amoebaGpu->psWorkArray_1_1->_pDevStream[0],
-                                     amoebaGpu->psWorkArray_1_2->_pDevStream[0],
-                                     amoebaGpu->psBorn->_pDevStream[0] );
+                                     amoebaGpu->psWorkArray_1_1->_pDevData,
+                                     amoebaGpu->psWorkArray_1_2->_pDevData,
+                                     amoebaGpu->psBorn->_pDevData );
    LAUNCHERROR("kReduce_dBorn");
 } */

@@ -1722,9 +1722,9 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )

        kReduceToBornForcePrefactorAndSASA_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                                                    amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
-                                                    amoebaGpu->psWorkArray_1_1->_pDevStream[0],
-                                                    amoebaGpu->psWorkArray_1_2->_pDevStream[0],
-                                                    amoebaGpu->gpuContext->psBornForce->_pDevStream[0] );
+                                                    amoebaGpu->psWorkArray_1_1->_pDevData,
+                                                    amoebaGpu->psWorkArray_1_2->_pDevData,
+                                                    amoebaGpu->gpuContext->psBornForce->_pDevData );
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

@@ -1739,9 +1739,9 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
        for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii);
           (void) fprintf( amoebaGpu->log,"bF %16.9e obc=%16.9e bR=%16.9e\n",
-                           amoebaGpu->gpuContext->psBornForce->_pSysStream[0][ii],
-                           amoebaGpu->gpuContext->psObcData->_pSysStream[0][ii].x,
-                           amoebaGpu->gpuContext->psBornRadii->_pSysStream[0][ii] );
+                           amoebaGpu->gpuContext->psBornForce->_pSysData[ii],
+                           amoebaGpu->gpuContext->psObcData->_pSysData[ii].x,
+                           amoebaGpu->gpuContext->psBornRadii->_pSysData[ii] );
        }
        (void) fflush( amoebaGpu->log );
        if( 1 ){
@@ -1764,9 +1764,9 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
    } else {
        kReduceToBornForcePrefactor_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                                             amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
-                                             amoebaGpu->psWorkArray_1_1->_pDevStream[0],
-                                             amoebaGpu->psWorkArray_1_2->_pDevStream[0],
-                                             amoebaGpu->gpuContext->psBornForce->_pDevStream[0] );
+                                             amoebaGpu->psWorkArray_1_1->_pDevData,
+                                             amoebaGpu->psWorkArray_1_2->_pDevData,
+                                             amoebaGpu->gpuContext->psBornForce->_pDevData );
    }
    LAUNCHERROR("kReduceToBornForcePrefactor");
 }
@@ -1784,12 +1784,12 @@ static void printKirkwoodBuffer( amoebaGpuContext amoebaGpu, unsigned int buffer
        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
    }

 /*
@@ -1797,14 +1797,14 @@ static void printKirkwoodBuffer( amoebaGpuContext amoebaGpu, unsigned int buffer
    stop  = -146016;
    float maxV = -1.0e+99;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        if(  amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){
+        if(  amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){
            unsigned int ii3Index      = ii/3;
            unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
            unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
            (void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n",
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
-            maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
+            maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
        }
    }
 */
@@ -1817,12 +1817,12 @@ static void printKirkwoodAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int t
        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
    }
 }
 #endif
@@ -1867,7 +1867,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
    }
   int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 0;

@@ -1875,7 +1875,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
    (void) fprintf( amoebaGpu->log, "Kirkwood input\n" ); (void) fflush( amoebaGpu->log );
    for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
        (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
-                        gpu->psBornRadii->_pSysStream[0][ii] );
+                        gpu->psBornRadii->_pSysData[ii] );
    }
 #endif

@@ -1908,9 +1908,9 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )

    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0]
+                                                                           amoebaGpu->psWorkUnit->_pDevData
 #ifdef AMOEBA_DEBUG
-                                                                           , debugArray->_pDevStream[0], targetAtom );
+                                                                           , debugArray->_pDevData, targetAtom );
 #else
                                                                           );
 #endif
@@ -1925,9 +1925,9 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
 #endif

        kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0]
+                                                                           amoebaGpu->psWorkUnit->_pDevData
 #ifdef AMOEBA_DEBUG
-                                                                           , debugArray->_pDevStream[0], targetAtom );
+                                                                           , debugArray->_pDevData, targetAtom );
 #else
                                                                           );
 #endif
@@ -1952,19 +1952,19 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
           int indexOffset3    = ii*3;
           int indexOffset9    = ii*9;
           (void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
-                           amoebaGpu->psLabFrameDipole->_pSysStream[0][indexOffset3],
-                           amoebaGpu->psLabFrameDipole->_pSysStream[0][indexOffset3+1],
-                           amoebaGpu->psLabFrameDipole->_pSysStream[0][indexOffset3+2],
-                           amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][indexOffset9],
-                           amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][indexOffset9+1],
-                           amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][indexOffset9+2] );
+                           amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3],
+                           amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+1],
+                           amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+2],
+                           amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9],
+                           amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+1],
+                           amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+2] );
           (void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
-                           amoebaGpu->psInducedDipoleS->_pSysStream[0][indexOffset3],
-                           amoebaGpu->psInducedDipoleS->_pSysStream[0][indexOffset3+1],
-                           amoebaGpu->psInducedDipoleS->_pSysStream[0][indexOffset3+2],
-                           amoebaGpu->psInducedDipolePolarS->_pSysStream[0][indexOffset3],
-                           amoebaGpu->psInducedDipolePolarS->_pSysStream[0][indexOffset3+1],
-                           amoebaGpu->psInducedDipolePolarS->_pSysStream[0][indexOffset3+2] );
+                           amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3],
+                           amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+1],
+                           amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+2],
+                           amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3],
+                           amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+1],
+                           amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+2] );
        }
 */
        debugArray->Download();
@@ -1978,8 +1978,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
            (void) fprintf( amoebaGpu->log,"%5d %5d DebugGk\n", targetAtom, jj );
            for( int kk = 0; kk < 8; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
                debugIndex += paddedNumberOfAtoms;
            }
            (void) fprintf( amoebaGpu->log,"\n" );
@@ -2015,28 +2015,28 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
           // force

           (void) fprintf( amoebaGpu->log,"KirkwoodF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset],
-                           amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psKirkwoodForce->_pSysData[indexOffset],
+                           amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+1],
+                           amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+2] );

           // torque

           (void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psTorque->_pSysData[indexOffset],
+                           amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                           amoebaGpu->psTorque->_pSysData[indexOffset+2] );

           // d_Born

-           //float bornForceValue = amoebaGpu->psBorn->_pSysStream[0][ii];
-           float bornForceValue = gpu->psBornForce->_pSysStream[0][ii];
-           float bornRadius     = gpu->psBornRadii->_pSysStream[0][ii];
-           float obcChain       = gpu->psObcChain->_pSysStream[0][ii];
+           //float bornForceValue = amoebaGpu->psBorn->_pSysData[ii];
+           float bornForceValue = gpu->psBornForce->_pSysData[ii];
+           float bornRadius     = gpu->psBornRadii->_pSysData[ii];
+           float obcChain       = gpu->psObcChain->_pSysData[ii];
           float bornSumValue   = bornRadius*obcChain != 0.0f ? bornForceValue/(bornRadius*bornRadius*obcChain) : 0.0f;
-           float bornValue      = bornSumValue - amoebaGpu->psBornPolar->_pSysStream[0][ii];
+           float bornValue      = bornSumValue - amoebaGpu->psBornPolar->_pSysData[ii];
           (void) fprintf( amoebaGpu->log,"dB br=%16.9e obcC=%16.9e bSum=%16.9e  [%16.9e %16.9e]",
                           bornRadius,obcChain, bornSumValue, bornValue,
-                           amoebaGpu->psBornPolar->_pSysStream[0][ii] );
+                           amoebaGpu->psBornPolar->_pSysData[ii] );

           // coords

@@ -2054,7 +2054,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
           // d_Born

           (void) fprintf( amoebaGpu->log,"dBrnSum %16.9e ",
-                           amoebaGpu->psBorn->_pSysStream[0][ii] );
+                           amoebaGpu->psBorn->_pSysData[ii] );

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
@@ -2099,9 +2099,9 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
           // force

           (void) fprintf( amoebaGpu->log,"KirkwoodF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset],
-                           amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psKirkwoodForce->_pSysData[indexOffset],
+                           amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+1],
+                           amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+2] );

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -936,12 +936,12 @@ static void kReduceForceTorque( amoebaGpuContext amoebaGpu )

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psKirkwoodEDiffForce->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodEDiffForce->_pDevData );
    LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff1");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );

    LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff2");
 }
@@ -959,12 +959,12 @@ static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int b
        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
-                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
+                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
    } 

 /*
@@ -972,14 +972,14 @@ static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int b
    stop  = -146016;
    float maxV = -1.0e+99;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
-        if(  amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){ 
+        if(  amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){ 
            unsigned int ii3Index      = ii/3;
            unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
            unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
            (void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n", 
                            ii/3,  bufferIndex, particleIndex,
-                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
-            maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
+                            amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
+            maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
        } 
    } 
 */
@@ -992,12 +992,12 @@ static void printKirkwoodEDiffAtomBuffers( amoebaGpuContext amoebaGpu, unsigned
        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
-                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
+                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
    } 
 }
 #endif
@@ -1041,7 +1041,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
    }   
    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 0;
 #endif
@@ -1074,39 +1074,39 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
    if (gpu->bOutputBufferPerWarp){

        kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipolePolar->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipoleS->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           gpu->psPosq4->_pDevData,
+                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
+                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
+                                                                           amoebaGpu->psInducedDipole->_pDevData,
+                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
+                                                                           amoebaGpu->psInducedDipoleS->_pDevData,
+                                                                           amoebaGpu->psInducedDipolePolarS->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif

    } else {

        kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
-                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                           gpu->psPosq4->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
-                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipole->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipolePolar->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipoleS->_pDevStream[0],
-                                                                           amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                           amoebaGpu->psWorkUnit->_pDevData,
+                                                                           gpu->psPosq4->_pDevData,
+                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
+                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
+                                                                           amoebaGpu->psInducedDipole->_pDevData,
+                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
+                                                                           amoebaGpu->psInducedDipoleS->_pDevData,
+                                                                           amoebaGpu->psInducedDipolePolarS->_pDevData,
+                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                           debugArray->_pDevStream[0], targetAtom );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                           debugArray->_pDevData, targetAtom );
 #else
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif
    }

@@ -1138,34 +1138,34 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
           // force

           (void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
+                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
+                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
    
           // torque

           (void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psTorque->_pSysData[indexOffset],
+                           amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                           amoebaGpu->psTorque->_pSysData[indexOffset+2] );

           // coords

 #if 0
            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
-                            gpu->psPosq4->_pSysStream[0][ii].x,
-                            gpu->psPosq4->_pSysStream[0][ii].y,
-                            gpu->psPosq4->_pSysStream[0][ii].z);
+                            gpu->psPosq4->_pSysData[ii].x,
+                            gpu->psPosq4->_pSysData[ii].y,
+                            gpu->psPosq4->_pSysData[ii].z);


           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
               int debugIndex = jj*gpu->natoms + ii;
-               float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
-               float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
-               float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
+               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
+               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
+               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
                               ii, jj, xx, yy, zz,
-                               debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );

           }
 #endif
@@ -1177,8 +1177,8 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
                   (void) fprintf( amoebaGpu->log,"%5d %5d ediff F%T\n", ii, jj );
                   for( int kk = 0; kk < 5; kk++ ){
                       (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                       debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                       debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                       debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                       debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
                       debugIndex += paddedNumberOfAtoms;
                   }
                   (void) fprintf( amoebaGpu->log,"\n" );
@@ -1203,12 +1203,12 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
                int offset  = 3*ii;
    
                (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
-                                amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][offset],
-                                amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][offset+1],
-                                amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][offset+2],
-                                amoebaGpu->psTorque->_pSysStream[0][offset],
-                                amoebaGpu->psTorque->_pSysStream[0][offset+1],
-                                amoebaGpu->psTorque->_pSysStream[0][offset+2] );
+                                amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset],
+                                amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+1],
+                                amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+2],
+                                amoebaGpu->psTorque->_pSysData[offset],
+                                amoebaGpu->psTorque->_pSysData[offset+1],
+                                amoebaGpu->psTorque->_pSysData[offset+2] );
                if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
            }   
        }   
@@ -1248,9 +1248,9 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
           // force

           (void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+1],
-                           amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+2] );
+                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
+                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
+                           amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
    
           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
@@ -715,13 +715,13 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
        int indexOffset     = ii*3;

        (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                        amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                        amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
-                        amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+                        amoebaGpu->psForce->_pSysData[indexOffset],
+                        amoebaGpu->psForce->_pSysData[indexOffset+1],
+                        amoebaGpu->psForce->_pSysData[indexOffset+2] );
        (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
-                        amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                        amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                        amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                        amoebaGpu->psTorque->_pSysData[indexOffset],
+                        amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                        amoebaGpu->psTorque->_pSysData[indexOffset+2] );
        if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
    }
    int nansDetected   = checkForNansAndInfinities( gpu->natoms*3, amoebaGpu->psForce );
@@ -735,7 +735,7 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
 // zero forces
 #if 0
    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
-        amoebaGpu->psForce->_pSysStream[0][ii] = 0.0f;
+        amoebaGpu->psForce->_pSysData[ii] = 0.0f;
    }
    amoebaGpu->psForce->Upload();
 #endif
@@ -746,15 +746,15 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*

 /*
    AmoebaTorqueMapZeroKernel<<< numBlocks, numThreads >>>( 
-       gpu->natoms, amoebaGpu->torqueMapForce->_pDevStream[0] );
+       gpu->natoms, amoebaGpu->torqueMapForce->_pDevData );
    LAUNCHERROR("AmoebaMapTrqZeroKernel");  
 */


    amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
-                psTorque->_pDevStream[0],
+                psTorque->_pDevData,
                amoebaGpu->maxMapTorqueDifference,
-                amoebaGpu->torqueMapForce->_pDevStream[0] );
+                amoebaGpu->torqueMapForce->_pDevData );
    LAUNCHERROR("AmoebaMapTrqKernel");
  
 //#ifdef AMOEBA_DEBUG
@@ -764,24 +764,24 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
    (void) fprintf( amoebaGpu->log,"Post AmoebaMapTrqKernel maxMapTorqueDifference=%d\n", amoebaGpu->maxMapTorqueDifference );
    for( int ii = 0; ii < gpu->natoms; ii++ ){
        (void) fprintf( amoebaGpu->log, "\n%5d multi[%d %d %d %d] offset=%d\n", ii, 
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].z,
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w,
-                        amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii] );
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
+                        amoebaGpu->psMultipoleAxisOffset->_pSysData[ii] );

        int indexOffset     = ii*3*amoebaGpu->maxMapTorqueDifference;
        float sum[3]        = { 0.0f,  0.0f,  0.0f };
        for( int jj = 0; jj < amoebaGpu->maxMapTorqueDifference; jj++ ){
-            if( amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset] != 0.0f ){ 
+            if( amoebaGpu->torqueMapForce->_pSysData[indexOffset] != 0.0f ){ 
                (void) fprintf( amoebaGpu->log,"    %4d %4d Temp[%16.9e %16.9e %16.9e] %d\n",
-                                ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii],
-                                amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
-                                amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
-                                amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2], indexOffset );
-                sum[0] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset];
-                sum[1] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1];
-                sum[2] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2];
+                                ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysData[ii],
+                                amoebaGpu->torqueMapForce->_pSysData[indexOffset],
+                                amoebaGpu->torqueMapForce->_pSysData[indexOffset+1],
+                                amoebaGpu->torqueMapForce->_pSysData[indexOffset+2], indexOffset );
+                sum[0] += amoebaGpu->torqueMapForce->_pSysData[indexOffset];
+                sum[1] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+1];
+                sum[2] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+2];
            }
            indexOffset += 3;
        }
@@ -796,8 +796,8 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
    amoebaMapTorqueReduce_kernel<<< numBlocks, numThreads>>>(
            numThreads, gpu->natoms,
            amoebaGpu->maxMapTorqueDifference,
-            amoebaGpu->torqueMapForce->_pDevStream[0],
-            psForce->_pDevStream[0] );
+            amoebaGpu->torqueMapForce->_pDevData,
+            psForce->_pDevData );
    LAUNCHERROR("amoebaMapTorqueReduce_kernel");

 #ifdef AMOEBA_DEBUG
@@ -813,13 +813,13 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
            int indexOffset     = ii*3;
    
            (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                            amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                            amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
-                            amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+                            amoebaGpu->psForce->_pSysData[indexOffset],
+                            amoebaGpu->psForce->_pSysData[indexOffset+1],
+                            amoebaGpu->psForce->_pSysData[indexOffset+2] );
            (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
-                            amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                            amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                            amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                            amoebaGpu->psTorque->_pSysData[indexOffset],
+                            amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                            amoebaGpu->psTorque->_pSysData[indexOffset+2] );
            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
        }
        (void) fflush( amoebaGpu->log );
@@ -885,13 +885,13 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
        int indexOffset     = ii*3;

        (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                        psForce->_pSysStream[0][indexOffset],
-                        psForce->_pSysStream[0][indexOffset+1],
-                        psForce->_pSysStream[0][indexOffset+2] );
+                        psForce->_pSysData[indexOffset],
+                        psForce->_pSysData[indexOffset+1],
+                        psForce->_pSysData[indexOffset+2] );
        (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
-                        psTorque->_pSysStream[0][indexOffset],
-                        psTorque->_pSysStream[0][indexOffset+1],
-                        psTorque->_pSysStream[0][indexOffset+2] );
+                        psTorque->_pSysData[indexOffset],
+                        psTorque->_pSysData[indexOffset+1],
+                        psTorque->_pSysData[indexOffset+2] );
        if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
    }
    int nansDetected   = checkForNansAndInfinities( gpu->natoms*3, psForce );
@@ -905,7 +905,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
 // zero forces
 #if 0
    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
-        psForce->_pSysStream[0][ii] = 0.0f;
+        psForce->_pSysData[ii] = 0.0f;
    }
    psForce->Upload();
 #endif
@@ -914,18 +914,18 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
    (void) fprintf( amoebaGpu->log,"Setting force & torque values.\n" );
    for( int ii = 0; ii < 3*gpu->natoms; ii += 3 ){

-        psTorque->_pSysStream[0][ii]   = 1.0f;
-        psTorque->_pSysStream[0][ii+1] = 0.0f;
-        psTorque->_pSysStream[0][ii+2] = 0.0f;
+        psTorque->_pSysData[ii]   = 1.0f;
+        psTorque->_pSysData[ii+1] = 0.0f;
+        psTorque->_pSysData[ii+2] = 0.0f;

-        psForce->_pSysStream[0][ii]    = 0.0f;
-        psForce->_pSysStream[0][ii+1]  = 0.0f;
-        psForce->_pSysStream[0][ii+2]  = 0.0f;
+        psForce->_pSysData[ii]    = 0.0f;
+        psForce->_pSysData[ii+1]  = 0.0f;
+        psForce->_pSysData[ii+2]  = 0.0f;
    }
    for( int ii = 0; ii < gpu->natoms; ii++ ){
-        psCudaForce4->_pSysStream[0][ii].x        = 0.0f;
-        psCudaForce4->_pSysStream[0][ii].y        = 0.0f;
-        psCudaForce4->_pSysStream[0][ii].z        = 0.0f;
+        psCudaForce4->_pSysData[ii].x        = 0.0f;
+        psCudaForce4->_pSysData[ii].y        = 0.0f;
+        psCudaForce4->_pSysData[ii].z        = 0.0f;
    }
    psForce->Upload();
    psTorque->Upload();
@@ -938,16 +938,16 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,

 /*
    AmoebaTorqueMapZeroKernel<<< numBlocks, numThreads >>>( 
-       gpu->natoms, amoebaGpu->torqueMapForce->_pDevStream[0] );
+       gpu->natoms, amoebaGpu->torqueMapForce->_pDevData );
    LAUNCHERROR("AmoebaMapTrqZeroKernel");  
 */


    //amoebaMapTorqueToForceOld_kernel<<< numBlocks, numThreads>>> (
    amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
-                psTorque->_pDevStream[0],
+                psTorque->_pDevData,
                amoebaGpu->maxMapTorqueDifference,
-                amoebaGpu->torqueMapForce->_pDevStream[0] );
+                amoebaGpu->torqueMapForce->_pDevData );
    LAUNCHERROR("AmoebaMapTrqKernel");
  
 //#ifdef AMOEBA_DEBUG
@@ -957,23 +957,23 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
    (void) fprintf( amoebaGpu->log,"Post AmoebaMapTrqKernel maxMapTorqueDifference=%d\n", amoebaGpu->maxMapTorqueDifference );
    for( int ii = 0; ii < gpu->natoms; ii++ ){
        (void) fprintf( amoebaGpu->log, "\n%5d multi[%d %d %d %d]\n", ii, 
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].z,
-                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w, amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii] );
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
+                        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w, amoebaGpu->psMultipoleAxisOffset->_pSysData[ii] );

        int indexOffset     = ii*3*amoebaGpu->maxMapTorqueDifference;
        float sum[3]        = { 0.0f,  0.0f,  0.0f };
        for( int jj = 0; jj < amoebaGpu->maxMapTorqueDifference; jj++ ){
-            if( amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset] != 0.0f ){ 
+            if( amoebaGpu->torqueMapForce->_pSysData[indexOffset] != 0.0f ){ 
                (void) fprintf( amoebaGpu->log,"    %4d %4d Temp[%16.9e %16.9e %16.9e] %d\n",
-                                ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii],
-                                amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
-                                amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
-                                amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2], indexOffset );
-                sum[0] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset];
-                sum[1] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1];
-                sum[2] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2];
+                                ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysData[ii],
+                                amoebaGpu->torqueMapForce->_pSysData[indexOffset],
+                                amoebaGpu->torqueMapForce->_pSysData[indexOffset+1],
+                                amoebaGpu->torqueMapForce->_pSysData[indexOffset+2], indexOffset );
+                sum[0] += amoebaGpu->torqueMapForce->_pSysData[indexOffset];
+                sum[1] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+1];
+                sum[2] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+2];
            }
            indexOffset += 3;
        }
@@ -988,8 +988,8 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
    amoebaMapTorqueReduce_kernel2<<< numBlocks, numThreads>>>(
            numThreads, gpu->natoms,
            amoebaGpu->maxMapTorqueDifference,
-            amoebaGpu->torqueMapForce->_pDevStream[0],
-            psForce->_pDevStream[0], psCudaForce4->_pDevStream[0] );
+            amoebaGpu->torqueMapForce->_pDevData,
+            psForce->_pDevData, psCudaForce4->_pDevData );
    LAUNCHERROR("amoebaMapTorqueReduce_kernel2");

 #ifdef AMOEBA_DEBUG
@@ -1007,21 +1007,21 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
            int indexOffset     = ii*3;
    
            (void) fprintf( amoebaGpu->log,"FTtl[%16.9e %16.9e %16.9e] ",
-                            psCudaForce4->_pSysStream[0][ii].x,
-                            psCudaForce4->_pSysStream[0][ii].y,
-                            psCudaForce4->_pSysStream[0][ii].z );
+                            psCudaForce4->_pSysData[ii].x,
+                            psCudaForce4->_pSysData[ii].y,
+                            psCudaForce4->_pSysData[ii].z );
            (void) fprintf( amoebaGpu->log,"F[%16.9e %16.9e %16.9e] ",
-                            psForce->_pSysStream[0][indexOffset],
-                            psForce->_pSysStream[0][indexOffset+1],
-                            psForce->_pSysStream[0][indexOffset+2] );
+                            psForce->_pSysData[indexOffset],
+                            psForce->_pSysData[indexOffset+1],
+                            psForce->_pSysData[indexOffset+2] );
            (void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
-                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
-                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
-                            amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
+                            amoebaGpu->torqueMapForce->_pSysData[indexOffset],
+                            amoebaGpu->torqueMapForce->_pSysData[indexOffset+1],
+                            amoebaGpu->torqueMapForce->_pSysData[indexOffset+2] );
            (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
-                            psTorque->_pSysStream[0][indexOffset],
-                            psTorque->_pSysStream[0][indexOffset+1],
-                            psTorque->_pSysStream[0][indexOffset+2] );
+                            psTorque->_pSysData[indexOffset],
+                            psTorque->_pSysData[indexOffset+1],
+                            psTorque->_pSysData[indexOffset+2] );
            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
        }
        (void) fflush( amoebaGpu->log );
@@ -1073,9 +1073,9 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
    int numBlocks     =  1 + (gpu->natoms/numThreads);

    amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
-                psTorque->_pDevStream[0],
+                psTorque->_pDevData,
                amoebaGpu->maxMapTorqueDifference,
-                amoebaGpu->torqueMapForce->_pDevStream[0] );
+                amoebaGpu->torqueMapForce->_pDevData );
    LAUNCHERROR("AmoebaMapTrqKernel");

    numBlocks  = gpu->natoms;
@@ -1084,8 +1084,8 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
    amoebaMapTorqueReduce_kernel3<<< numBlocks, numThreads>>>(
            numThreads, gpu->natoms,
            amoebaGpu->maxMapTorqueDifference,
-            amoebaGpu->torqueMapForce->_pDevStream[0],
-            psCudaForce4->_pDevStream[0] );
+            amoebaGpu->torqueMapForce->_pDevData,
+            psCudaForce4->_pDevData );
    LAUNCHERROR("amoebaMapTorqueReduce_kernel3");

 #ifdef AMOEBA_DEBUG
@@ -1103,17 +1103,17 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
            int indexOffset     = ii*3;

            (void) fprintf( amoebaGpu->log,"FTtl[%16.9e %16.9e %16.9e] ",
-                            psCudaForce4->_pSysStream[0][ii].x,
-                            psCudaForce4->_pSysStream[0][ii].y,
-                            psCudaForce4->_pSysStream[0][ii].z );
+                            psCudaForce4->_pSysData[ii].x,
+                            psCudaForce4->_pSysData[ii].y,
+                            psCudaForce4->_pSysData[ii].z );
            (void) fprintf( amoebaGpu->log,"F[%16.9e %16.9e %16.9e] ",
-                            amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                            amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
-                            amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+                            amoebaGpu->psForce->_pSysData[indexOffset],
+                            amoebaGpu->psForce->_pSysData[indexOffset+1],
+                            amoebaGpu->psForce->_pSysData[indexOffset+2] );
            (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
-                            amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                            amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                            amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                            amoebaGpu->psTorque->_pSysData[indexOffset],
+                            amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                            amoebaGpu->psTorque->_pSysData[indexOffset+2] );
            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
        }
        (void) fflush( amoebaGpu->log );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -219,12 +219,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
 {
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceMI_Fields1");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], outputPolarArray->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
    LAUNCHERROR("kReduceMI_Fields2");
 }

@@ -252,7 +252,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
 #endif

@@ -260,13 +260,13 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext

    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                 amoebaGpu->psWorkUnit->_pDevData,
+                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                 debugArray->_pDevStream[0], targetAtom );
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                 debugArray->_pDevData, targetAtom );
 #else
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif

    } else {
@@ -280,13 +280,13 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
        (void) fflush( amoebaGpu->log );
 #endif
        kCalculateAmoebaMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                 amoebaGpu->psWorkUnit->_pDevData,
+                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                 debugArray->_pDevStream[0], targetAtom );
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                 debugArray->_pDevData, targetAtom );
 #else
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif


@@ -314,34 +314,34 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
           // MI

           (void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
-                           outputArray->_pSysStream[0][indexOffset],
-                           outputArray->_pSysStream[0][indexOffset+1],
-                           outputArray->_pSysStream[0][indexOffset+2] );
+                           outputArray->_pSysData[indexOffset],
+                           outputArray->_pSysData[indexOffset+1],
+                           outputArray->_pSysData[indexOffset+2] );
    
           // MI polar

           (void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e] ",
-                           outputPolarArray->_pSysStream[0][indexOffset],
-                           outputPolarArray->_pSysStream[0][indexOffset+1],
-                           outputPolarArray->_pSysStream[0][indexOffset+2] );
+                           outputPolarArray->_pSysData[indexOffset],
+                           outputPolarArray->_pSysData[indexOffset+1],
+                           outputPolarArray->_pSysData[indexOffset+2] );

           // coords

 #if 0
            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
-                            gpu->psPosq4->_pSysStream[0][ii].x,
-                            gpu->psPosq4->_pSysStream[0][ii].y,
-                            gpu->psPosq4->_pSysStream[0][ii].z);
+                            gpu->psPosq4->_pSysData[ii].x,
+                            gpu->psPosq4->_pSysData[ii].y,
+                            gpu->psPosq4->_pSysData[ii].z);


           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
               int debugIndex = jj*gpu->natoms + ii;
-               float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
-               float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
-               float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
+               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
+               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
+               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
                               ii, jj, xx, yy, zz,
-                               debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );

           }
 #endif
@@ -357,18 +357,18 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
                   int debugIndex = jj;
                   (void) fprintf( amoebaGpu->log,"%4d %4d Pint [%16.9e %16.9e %16.9e %16.9e] ",
                                   ii, jj,
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                   debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );

                   //debugIndex += gpu->natoms;
                   debugIndex += paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );

                   int index = 0;
-                   sums[index][0] += debugArray->_pSysStream[0][debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysStream[0][debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysStream[0][debugIndex].z; 
+                   sums[index][0] += debugArray->_pSysData[debugIndex].x; 
+                   sums[index][1] += debugArray->_pSysData[debugIndex].y; 
+                   sums[index][2] += debugArray->_pSysData[debugIndex].z; 
                   
                   if( count && ( (count % 31) == 0) ){
                      static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
@@ -383,12 +383,12 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext

                   debugIndex += paddedNumberOfAtoms;
                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
-                                   debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );

                   index++;
-                   sums[index][0] += debugArray->_pSysStream[0][debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysStream[0][debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysStream[0][debugIndex].z; 
+                   sums[index][0] += debugArray->_pSysData[debugIndex].x; 
+                   sums[index][1] += debugArray->_pSysData[debugIndex].y; 
+                   sums[index][2] += debugArray->_pSysData[debugIndex].z; 

                   if( count && ( (count % 31) == 0) ){
                      static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
@@ -475,11 +475,11 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu

    kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
         gpu->natoms,
-         amoebaGpu->psE_Field->_pDevStream[0],
-         amoebaGpu->psE_FieldPolar->_pDevStream[0],
-         amoebaGpu->psPolarizability->_pDevStream[0],
-         amoebaGpu->psInducedDipole->_pDevStream[0],
-         amoebaGpu->psInducedDipolePolar->_pDevStream[0] );
+         amoebaGpu->psE_Field->_pDevData,
+         amoebaGpu->psE_FieldPolar->_pDevData,
+         amoebaGpu->psPolarizability->_pDevData,
+         amoebaGpu->psInducedDipole->_pDevData,
+         amoebaGpu->psInducedDipolePolar->_pDevData );
    LAUNCHERROR("AmoebaMutualInducedFieldSetup");  

 #ifdef AMOEBA_DEBUG
@@ -495,18 +495,18 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        int maxPrint = 20000;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii, 
-                            amoebaGpu->psPolarizability->_pSysStream[0][offset] );
-            if( amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+1] ||
-                amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+2] ){
-                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysStream[0][offset+1], amoebaGpu->psPolarizability->_pSysStream[0][offset+2] ); 
+                            amoebaGpu->psPolarizability->_pSysData[offset] );
+            if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
+                amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
+                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); 
            }

            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
-                            amoebaGpu->psE_Field->_pSysStream[0][offset],       amoebaGpu->psE_Field->_pSysStream[0][offset+1],       amoebaGpu->psE_Field->_pSysStream[0][offset+2],
-                            amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
+                            amoebaGpu->psE_Field->_pSysData[offset],       amoebaGpu->psE_Field->_pSysData[offset+1],       amoebaGpu->psE_Field->_pSysData[offset+2],
+                            amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
-                            amoebaGpu->psE_FieldPolar->_pSysStream[0][offset],       amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+1],       amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+2],
-                            amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
+                            amoebaGpu->psE_FieldPolar->_pSysData[offset],       amoebaGpu->psE_FieldPolar->_pSysData[offset+1],       amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
+                            amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
            offset += 3;
            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
        }   
@@ -529,17 +529,17 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        // post matrix multiply

        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
-           gpu->natoms, amoebaGpu->psPolarizability->_pDevStream[0],
-           amoebaGpu->psInducedDipole->_pDevStream[0], amoebaGpu->psInducedDipolePolar->_pDevStream[0],
-           amoebaGpu->psE_Field->_pDevStream[0],       amoebaGpu->psE_FieldPolar->_pDevStream[0],
-           amoebaGpu->psWorkVector[0]->_pDevStream[0],     amoebaGpu->psWorkVector[1]->_pDevStream[0] );
+           gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
+           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
+           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,
+           amoebaGpu->psWorkVector[0]->_pDevData,     amoebaGpu->psWorkVector[1]->_pDevData );
        LAUNCHERROR("kSorUpdateMutualInducedField");  

        // get total epsilon -- performing sums on gpu

        kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
-           3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0],
-           amoebaGpu->psCurrentEpsilon->_pDevStream[0] );
+           3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
+           amoebaGpu->psCurrentEpsilon->_pDevData );
        LAUNCHERROR("kReduceMutualInducedFieldDelta");

        if( amoebaGpu->log ){
@@ -548,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu

        // Debye=48.033324f
        amoebaGpu->psCurrentEpsilon->Download();
-        float currentEpsilon          = amoebaGpu->psCurrentEpsilon->_pSysStream[0][0];
+        float currentEpsilon          = amoebaGpu->psCurrentEpsilon->_pSysData[0];
        amoebaGpu->mutualInducedCurrentEpsilon   = currentEpsilon;

        if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ 
@@ -569,9 +569,9 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
    
                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
+                                amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
+                                amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
                    ii =  (gpu->natoms - maxPrint);
                    offset = 3*(ii+1);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
@@ -6,8 +6,6 @@
 #include "amoebaCudaKernels.h"
 #include "bbsort.h"

-//#define AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -241,6 +239,9 @@ void kGridSpreadFixedMultipoles_kernel()
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                    if( iz >= cSim.pmeGridSize.z ){
+                        iz -= cSim.pmeGridSize.z;
+                    }
                    float atomCharge = cSim.pPosq[atomIndex].w;
                    float atomDipoleX = xscale*cAmoebaSim.pLabFrameDipole[atomIndex*3];
                    float atomDipoleY = yscale*cAmoebaSim.pLabFrameDipole[atomIndex*3+1];
@@ -271,6 +272,9 @@ void kGridSpreadFixedMultipoles_kernel()
                        int atomIndex = atomData.x;
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                        if( iz >= cSim.pmeGridSize.z ){
+                            iz -= cSim.pmeGridSize.z;
+                        }
                        float atomCharge = cSim.pPosq[atomIndex].w;
                        float atomDipoleX = xscale*cAmoebaSim.pLabFrameDipole[atomIndex*3];
                        float atomDipoleY = yscale*cAmoebaSim.pLabFrameDipole[atomIndex*3+1];
@@ -332,6 +336,9 @@ void kGridSpreadInducedDipoles_kernel()
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                    if( iz >= cSim.pmeGridSize.z ){
+                        iz -= cSim.pmeGridSize.z;
+                    }
                    float inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
                    float inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
                    float inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
@@ -360,6 +367,9 @@ void kGridSpreadInducedDipoles_kernel()
                        int atomIndex = atomData.x;
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                        if( iz >= cSim.pmeGridSize.z ){
+                            iz -= cSim.pmeGridSize.z;
+                        }
                        float inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
                        float inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
                        float inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
@@ -705,21 +715,23 @@ void kComputeInducedPotentialFromGrid_kernel()
        cAmoebaSim.pPhid[10*m+1] = tuv100_1;
        cAmoebaSim.pPhid[10*m+2] = tuv010_1;
        cAmoebaSim.pPhid[10*m+3] = tuv001_1;
-        cAmoebaSim.pPhid[10*m+4] = tuv100_1;
-        cAmoebaSim.pPhid[10*m+5] = tuv010_1;
+        cAmoebaSim.pPhid[10*m+4] = tuv200_1;
+        cAmoebaSim.pPhid[10*m+5] = tuv020_1;
        cAmoebaSim.pPhid[10*m+6] = tuv002_1;
        cAmoebaSim.pPhid[10*m+7] = tuv110_1;
        cAmoebaSim.pPhid[10*m+8] = tuv101_1;
        cAmoebaSim.pPhid[10*m+9] = tuv011_1;
+
        cAmoebaSim.pPhip[10*m+1] = tuv100_2;
        cAmoebaSim.pPhip[10*m+2] = tuv010_2;
        cAmoebaSim.pPhip[10*m+3] = tuv001_2;
-        cAmoebaSim.pPhip[10*m+4] = tuv100_2;
-        cAmoebaSim.pPhip[10*m+5] = tuv010_2;
+        cAmoebaSim.pPhip[10*m+4] = tuv200_2;
+        cAmoebaSim.pPhip[10*m+5] = tuv020_2;
        cAmoebaSim.pPhip[10*m+6] = tuv002_2;
        cAmoebaSim.pPhip[10*m+7] = tuv110_2;
        cAmoebaSim.pPhip[10*m+8] = tuv101_2;
        cAmoebaSim.pPhip[10*m+9] = tuv011_2;
+
        cAmoebaSim.pPhidp[20*m] = tuv000;
        cAmoebaSim.pPhidp[20*m+1] = tuv100;
        cAmoebaSim.pPhidp[20*m+2] = tuv010;
@@ -888,23 +900,40 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
        float* phip = &cAmoebaSim.pPhip[10*i];
        float* phid = &cAmoebaSim.pPhid[10*i];
        float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
        for (int k = 0; k < 3; k++) {
            int j1 = deriv1[k+1];
            int j2 = deriv2[k+1];
            int j3 = deriv3[k+1];
            energy += inducedDipole[k]*phi[k+1];
+
            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1] + inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1];
            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2] + inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2];
            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3] + inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3];
+
        }
+
+        f.x *= cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
+        f.y *= cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
+        f.z *= cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
+
        for (int k = 0; k < 10; k++) {
            f.x += multipole[k]*phidp[deriv1[k]];
            f.y += multipole[k]*phidp[deriv2[k]];
            f.z += multipole[k]*phidp[deriv3[k]];
        }
+
+
        f.x *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
        f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
        f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
+
+/*
+        f.x *= 0.5f*cAmoebaSim.electric;
+        f.y *= 0.5f*cAmoebaSim.electric;
+        f.z *= 0.5f*cAmoebaSim.electric;
+*/
+
        float4 force = cSim.pForce4[i];
        force.x -= f.x;
        force.y -= f.y;
@@ -1021,5 +1050,7 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
    gpuContext gpu = amoebaGpu->gpuContext;
    kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
+
    cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
+    LAUNCHERROR("cudaComputeAmoebaMapTorquesAndAddTotalForce2_kCalculateAmoebaPMEInducedDipoleForces");
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -1089,12 +1089,12 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
 {
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                             amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
+                             amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
    LAUNCHERROR("kReducePmeDirectElectrostaticForce");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                             amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
+                             amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
    LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
 }

@@ -1109,9 +1109,9 @@ static void zeroForce( amoebaGpuContext amoebaGpu )
 {
    gpuContext gpu   = amoebaGpu->gpuContext;
    for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-        gpu->psForce4->_pSysStream[0][ii].x  = 0.0f;
-        gpu->psForce4->_pSysStream[0][ii].y  = 0.0f;
-        gpu->psForce4->_pSysStream[0][ii].z  = 0.0f;
+        gpu->psForce4->_pSysData[ii].x  = 0.0f;
+        gpu->psForce4->_pSysData[ii].y  = 0.0f;
+        gpu->psForce4->_pSysData[ii].z  = 0.0f;
    }
    gpu->psForce4->Upload();
 }
@@ -1129,10 +1129,10 @@ static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
    gpu->psForce4->Download();
    int indexOffset  = 0;
    for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-        amoebaGpu->psForce->_pSysStream[0][indexOffset]    = conversion*(gpu->psForce4->_pSysStream[0][ii].x);
-        amoebaGpu->psForce->_pSysStream[0][indexOffset+1]  = conversion*(gpu->psForce4->_pSysStream[0][ii].y);
-        amoebaGpu->psForce->_pSysStream[0][indexOffset+2]  = conversion*(gpu->psForce4->_pSysStream[0][ii].z);
-        indexOffset                                       += 3;
+        amoebaGpu->psForce->_pSysData[indexOffset]    = conversion*(gpu->psForce4->_pSysData[ii].x);
+        amoebaGpu->psForce->_pSysData[indexOffset+1]  = conversion*(gpu->psForce4->_pSysData[ii].y);
+        amoebaGpu->psForce->_pSysData[indexOffset+2]  = conversion*(gpu->psForce4->_pSysData[ii].z);
+        indexOffset                                  += 3;
    }
    amoebaGpu->psForce->Upload();
 }
@@ -1179,8 +1179,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
                      methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
    }
    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    int maxOffset                             = 20;
+    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxOffset*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxOffset*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 49;
 #endif
@@ -1198,18 +1199,18 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
        unsigned int lineTokenIndex = 1;

        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
-        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysData[offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysData[offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipole->_pSysData[offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
        offset                                              -= 3;
-        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysData[offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysData[offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
+        amoebaGpu->psInducedDipolePolar->_pSysData[offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
    }
    float conversion = 0.1f;
    for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
-        amoebaGpu->psInducedDipole->_pSysStream[0][ii]       *= conversion;
-        amoebaGpu->psInducedDipolePolar->_pSysStream[0][ii]  *= conversion;
+        amoebaGpu->psInducedDipole->_pSysData[ii]       *= conversion;
+        amoebaGpu->psInducedDipolePolar->_pSysData[ii]  *= conversion;
    }
    amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
    SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
@@ -1248,24 +1249,24 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )

        kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                          gpu->sim.pInteractingWorkUnit,
-                                                                          amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                          amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                          amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                          debugArray->_pDevStream[0], targetAtom );
+                                                                          amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                          debugArray->_pDevData, targetAtom );
 #else
-                                                                          amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                          amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif

    } else {

        kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                    gpu->sim.pInteractingWorkUnit,
-                                                                    amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                    amoebaGpu->psWorkArray_3_1->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                                    amoebaGpu->psWorkArray_3_2->_pDevStream[0],
-                                                                    debugArray->_pDevStream[0], targetAtom );
+                                                                    amoebaGpu->psWorkArray_3_2->_pDevData,
+                                                                    debugArray->_pDevData, targetAtom );
 #else
-                                                                    amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                    amoebaGpu->psWorkArray_3_2->_pDevData );
 #endif
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
@@ -1292,20 +1293,20 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
            // force

            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
-                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset],
-                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
-                            conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+                            conversion*amoebaGpu->psForce->_pSysData[indexOffset],
+                            conversion*amoebaGpu->psForce->_pSysData[indexOffset+1],
+                            conversion*amoebaGpu->psForce->_pSysData[indexOffset+2] );

-            forceSum[0]         += amoebaGpu->psForce->_pSysStream[0][indexOffset];
-            forceSum[1]         += amoebaGpu->psForce->_pSysStream[0][indexOffset+1];
-            forceSum[2]         += amoebaGpu->psForce->_pSysStream[0][indexOffset+2];
+            forceSum[0]         += amoebaGpu->psForce->_pSysData[indexOffset];
+            forceSum[1]         += amoebaGpu->psForce->_pSysData[indexOffset+1];
+            forceSum[2]         += amoebaGpu->psForce->_pSysData[indexOffset+2];

            // torque

            (void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
-                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset],
-                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
-                            conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+                            conversion*amoebaGpu->psTorque->_pSysData[indexOffset],
+                            conversion*amoebaGpu->psTorque->_pSysData[indexOffset+1],
+                            conversion*amoebaGpu->psTorque->_pSysData[indexOffset+2] );

            (void) fprintf( amoebaGpu->log,"\n" );
            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
@@ -1337,15 +1338,15 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
            int offset3             = 8*paddedNumberOfAtoms;
            for( int jj = 0; jj < gpu->natoms; jj++ ){
                int debugIndex = jj;
-                if( fabs( debugArray->_pSysStream[0][debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
+                if( fabs( debugArray->_pSysData[debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
                if( jj != targetAtom ){
-                    torqueSum0[0] += debugArray->_pSysStream[0][debugIndex+offset0].x + debugArray->_pSysStream[0][debugIndex+offset1].x; 
-                    torqueSum0[1] += debugArray->_pSysStream[0][debugIndex+offset0].y + debugArray->_pSysStream[0][debugIndex+offset1].y; 
-                    torqueSum0[2] += debugArray->_pSysStream[0][debugIndex+offset0].z + debugArray->_pSysStream[0][debugIndex+offset1].z; 
+                    torqueSum0[0] += debugArray->_pSysData[debugIndex+offset0].x + debugArray->_pSysData[debugIndex+offset1].x; 
+                    torqueSum0[1] += debugArray->_pSysData[debugIndex+offset0].y + debugArray->_pSysData[debugIndex+offset1].y; 
+                    torqueSum0[2] += debugArray->_pSysData[debugIndex+offset0].z + debugArray->_pSysData[debugIndex+offset1].z; 

-                    torqueSum1[0] += debugArray->_pSysStream[0][debugIndex+offset2].x + debugArray->_pSysStream[0][debugIndex+offset3].x; 
-                    torqueSum1[1] += debugArray->_pSysStream[0][debugIndex+offset2].y + debugArray->_pSysStream[0][debugIndex+offset3].y; 
-                    torqueSum1[2] += debugArray->_pSysStream[0][debugIndex+offset2].z + debugArray->_pSysStream[0][debugIndex+offset3].z; 
+                    torqueSum1[0] += debugArray->_pSysData[debugIndex+offset2].x + debugArray->_pSysData[debugIndex+offset3].x; 
+                    torqueSum1[1] += debugArray->_pSysData[debugIndex+offset2].y + debugArray->_pSysData[debugIndex+offset3].y; 
+                    torqueSum1[2] += debugArray->_pSysData[debugIndex+offset2].z + debugArray->_pSysData[debugIndex+offset3].z; 
                }
                if( jj == 2 ){
                    offset0             += 2*paddedNumberOfAtoms;
@@ -1353,8 +1354,8 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
                }
                for( int kk = 0; kk < 12; kk++ ){
                    (void) fprintf( amoebaGpu->log,"%5d %5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj, kk,
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
                    debugIndex += paddedNumberOfAtoms;
                }
                (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] Sum\n", targetAtom, jj,
@@ -1405,13 +1406,48 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
    if( 0 ){
        gpuContext gpu = amoebaGpu->gpuContext;
        std::vector<int> fileId;
-        zeroForce( amoebaGpu );
-        kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
-        copyForce( amoebaGpu, -1.0f/41.84 );
+        float conversion = -1.0f/41.84;
+        copyForce( amoebaGpu, conversion );
        VectorOfDoubleVectors outputVector;
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, gpu->psAtomIndex->_pSysData );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDemForce", fileId, outputVector );
+
+
+        int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        CUDAStream<float>* debugArray             = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "FArray");
+        int index                                 = 0;
+        for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
+            debugArray->_pSysData[index]    = amoebaGpu->psForce->_pSysData[index];
+            debugArray->_pSysData[index+1]  = amoebaGpu->psForce->_pSysData[index+1];
+            debugArray->_pSysData[index+2]  = amoebaGpu->psForce->_pSysData[index+2];
+            index                          += 3;
+        }
+        //zeroForce( amoebaGpu );
+        kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
+        copyForce( amoebaGpu, conversion );
+        VectorOfDoubleVectors outputVector1;
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector1, gpu->psAtomIndex->_pSysData );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector1 );
+
+        VectorOfDoubleVectors outputVector2;
+        index = 0;
+        for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
+            amoebaGpu->psForce->_pSysData[index]   -= debugArray->_pSysData[index];
+            amoebaGpu->psForce->_pSysData[index+1] -= debugArray->_pSysData[index+1];
+            amoebaGpu->psForce->_pSysData[index+2] -= debugArray->_pSysData[index+2];
+            index                                  += 3;
+ 
+        }
+        amoebaGpu->psForce->Upload();
+        outputVector.resize(0);
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector2, gpu->psAtomIndex->_pSysData );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDepForce", fileId, outputVector2 );
+
+
        zeroForce( amoebaGpu );
+        kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
+        //zeroForce( amoebaGpu );
+        exit(0);
    }

    if( 0 ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
@@ -335,8 +335,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
                            } else {
    
                                sA[threadIdx.x].tempForce[0]  = forceTorqueEnergy[0].x;
-                                sA[threadIdx.x].tempForce[1]  = forceTorqueEnergy[1].y;
-                                sA[threadIdx.x].tempForce[2]  = forceTorqueEnergy[2].z;
+                                sA[threadIdx.x].tempForce[1]  = forceTorqueEnergy[0].y;
+                                sA[threadIdx.x].tempForce[2]  = forceTorqueEnergy[0].z;
    
                                sA[threadIdx.x].tempTorque[0] = forceTorqueEnergy[2].x;
                                sA[threadIdx.x].tempTorque[1] = forceTorqueEnergy[2].y;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
@@ -144,14 +144,13 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
    unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
    unsigned int indexI                = 0;
    unsigned int indexJ                = indexI ? 0 : 2;
-    unsigned int indices[4]            = { indexI, indexJ, indexI+1, indexJ+1 };
    float flag                         = 7.0f;

    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) (y + j);
    debugArray[index].z                = dScaleValue;
    debugArray[index].w                = pScaleValue;
-
+/*
    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = (float) bExclusionFlag;
    debugArray[index].y                = (float) (tgx);
@@ -163,14 +162,31 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
    debugArray[index].y                = (float) pScaleMask.x;
    debugArray[index].z                = (float) pScaleMask.y;
    debugArray[index].w                = flag;
+*/
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = match ? 0.0f : ijField[0].x;
+    debugArray[index].y                = match ? 0.0f : ijField[1].x;
+    debugArray[index].z                = match ? 0.0f : ijField[2].x;
+    debugArray[index].w                = flag + 1.0f;
+
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = match ? 0.0f : ijField[0].z;
+    debugArray[index].y                = match ? 0.0f : ijField[1].z;
+    debugArray[index].z                = match ? 0.0f : ijField[2].z;
+    debugArray[index].w                = flag + 2.0f;
+
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = match ? 0.0f : ijField[0].y;
+    debugArray[index].y                = match ? 0.0f : ijField[1].y;
+    debugArray[index].z                = match ? 0.0f : ijField[2].y;
+    debugArray[index].w                = flag + 3.0f;
+
+    index                             += cAmoebaSim.paddedNumberOfAtoms;
+    debugArray[index].x                = match ? 0.0f : ijField[0].w;
+    debugArray[index].y                = match ? 0.0f : ijField[1].w;
+    debugArray[index].z                = match ? 0.0f : ijField[2].w;
+    debugArray[index].w                = flag + 4.0f;

-    for( int ii = 0; ii < 4; ii++ ){
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = match ? 0.0f : ijField[indices[ii]][0];
-        debugArray[index].y                = match ? 0.0f : ijField[indices[ii]][1];
-        debugArray[index].z                = match ? 0.0f : ijField[indices[ii]][2];
-        debugArray[index].w                = flag;
-    }

    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
@@ -314,6 +330,7 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
            debugArray[index].w                = pScaleValue;

            float flag                         = 9.0f;
+/*
            index                             += cAmoebaSim.paddedNumberOfAtoms;
            debugArray[index].x                = (float) bExclusionFlag;
            debugArray[index].y                = (float) (tgx);
@@ -325,30 +342,30 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
            debugArray[index].y                = (float) pScaleMask.x;
            debugArray[index].z                = (float) pScaleMask.y;
            debugArray[index].w                = (float) flags;
-        
+ */       
            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexI][0];
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexI][1];
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexI][2];
-            debugArray[index].w                =  flag;
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].x;
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].x;
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].x;
+            debugArray[index].w                =  flag + 1.0f;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexJ][0];
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexJ][1];
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexJ][2];
-            debugArray[index].w                = flag;
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].y;
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].y;
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].y;
+            debugArray[index].w                = flag + 2.0f;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexI+1][0];
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexI+1][1];
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexI+1][2];
-            debugArray[index].w                = flag;
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].z;
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].z;
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].z;
+            debugArray[index].w                = flag + 3.0f;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[indexJ+1][0];
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[indexJ+1][1];
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[indexJ+1][2];
-            debugArray[index].w                = flag;
+            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].w;
+            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].w;
+            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].w;
+            debugArray[index].w                = flag + 4.0f;

            for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
                index                             += cAmoebaSim.paddedNumberOfAtoms;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -122,7 +122,6 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
                fieldPolarSum[1]       += mask ? ijField[1].z : 0.0f;
                fieldPolarSum[2]       += mask ? ijField[2].z : 0.0f;

-/*
 #ifdef AMOEBA_DEBUG
 if( atomI == targetAtom || (y+j) == targetAtom ){
            unsigned int index                 = atomI == targetAtom ? (y+j) : atomI;
@@ -151,35 +150,35 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

            index                             += cAmoebaSim.paddedNumberOfAtoms;
            float flag                         = 6.0f;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
+            debugArray[index].x                = ijField[0].x;
+            debugArray[index].y                = ijField[1].x;
+            debugArray[index].z                = ijField[2].x;
            debugArray[index].w                = flag;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
+            debugArray[index].x                = ijField[0].x;
+            debugArray[index].y                = ijField[1].x;
+            debugArray[index].z                = ijField[2].x;
            debugArray[index].w                = flag;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI+1][0];
-            debugArray[index].y                = ijField[indexI+1][1];
-            debugArray[index].z                = ijField[indexI+1][2];
+            debugArray[index].x                = ijField[0].z;
+            debugArray[index].y                = ijField[1].z;
+            debugArray[index].z                = ijField[2].z;
            debugArray[index].w                = flag;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ+1][0];
-            debugArray[index].y                = ijField[indexJ+1][1];
-            debugArray[index].z                = ijField[indexJ+1][2];
+            debugArray[index].x                = ijField[0].z;
+            debugArray[index].y                = ijField[1].z;
+            debugArray[index].z                = ijField[2].z;
            debugArray[index].w                = flag;

            index                             += cAmoebaSim.paddedNumberOfAtoms;

            index                             += cAmoebaSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : ijField[indexI][2];
+            debugArray[index].x                = match ? 0.0f : ijField[0].x;
+            debugArray[index].y                = match ? 0.0f : ijField[1].x;
+            debugArray[index].z                = match ? 0.0f : ijField[2].x;
            index                             += cAmoebaSim.paddedNumberOfAtoms;
            unsigned int mask                  = 1 << j;
            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
@@ -192,7 +191,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

 }
 #endif
-*/
+
            }

            // Write results
@@ -274,13 +273,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

                        } else {

-                            sA[threadIdx.x].tempBuffer[0]  = mask ? 0.0f : ijField[0].y;
-                            sA[threadIdx.x].tempBuffer[1]  = mask ? 0.0f : ijField[1].y;
-                            sA[threadIdx.x].tempBuffer[2]  = mask ? 0.0f : ijField[2].y;
+                            sA[threadIdx.x].tempBuffer[0]  = mask ? ijField[0].y : 0.0;
+                            sA[threadIdx.x].tempBuffer[1]  = mask ? ijField[1].y : 0.0;
+                            sA[threadIdx.x].tempBuffer[2]  = mask ? ijField[2].y : 0.0;

-                            sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
-                            sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
-                            sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
+                            sA[threadIdx.x].tempBufferP[0] = mask ? ijField[0].w : 0.0;
+                            sA[threadIdx.x].tempBufferP[1] = mask ? ijField[1].w : 0.0;
+                            sA[threadIdx.x].tempBufferP[2] = mask ? ijField[2].w : 0.0;

                            if( tgx % 2 == 0 ){
                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -423,15 +423,15 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            int mIndex = 3*ii;
             (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d] x[%16.9e %16.9e %16.9e] dpl[%16.9e %16.9e %16.9e]\nRot[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n\n", ii,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w,
-                             gpu->psPosq4->_pSysStream[0][ii].x,
-                             gpu->psPosq4->_pSysStream[0][ii].y,
-                             gpu->psPosq4->_pSysStream[0][ii].z,
-                             amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex],
-                             amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex+1],
-                             amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex+2] );
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
+                             gpu->psPosq4->_pSysData[ii].x,
+                             gpu->psPosq4->_pSysData[ii].y,
+                             gpu->psPosq4->_pSysData[ii].z,
+                             amoebaGpu->psMolecularDipole->_pSysData[mIndex],
+                             amoebaGpu->psMolecularDipole->_pSysData[mIndex+1],
+                             amoebaGpu->psMolecularDipole->_pSysData[mIndex+2] );
        }
    }
 //    int64 kernelTime = AmoebaTiming::getTimeOfDay();
@@ -452,10 +452,10 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
        (void) fprintf( amoebaGpu->log, "Finished rotation kernel execution in %lf us\n", kernelTime ); (void) fflush( amoebaGpu->log );
        (void) fprintf( amoebaGpu->log, "psLabFrameDipole=%p _pSysStream=%p _pSysStream[0]=%p _pDevStream=%p _pDevStream[0]=%p\n",
                        amoebaGpu->psLabFrameDipole,  amoebaGpu->psLabFrameDipole->_pSysStream, 
-                        amoebaGpu->psLabFrameDipole->_pSysStream[0], amoebaGpu->psLabFrameDipole->_pDevStream, amoebaGpu->psLabFrameDipole->_pDevStream[0] );
+                        amoebaGpu->psLabFrameDipole->_pSysData, amoebaGpu->psLabFrameDipole->_pDevStream, amoebaGpu->psLabFrameDipole->_pDevData );
        fflush( amoebaGpu->log );

-        amoebaGpu->psRotationMatrix->Download();
+        //amoebaGpu->psRotationMatrix->Download();
        amoebaGpu->psLabFrameDipole->Download();
        (void) fprintf( amoebaGpu->log, "psLabFrameDipole completed\n" );  (void) fflush( amoebaGpu->log );

@@ -469,46 +469,46 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
             int quadrupoleOffset = 9*ii;

             (void) fprintf( amoebaGpu->log,"\n%6d [%6d %6d %6d] ", ii,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w );
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w );
             // coords

             (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e]\n",
-                             gpu->psPosq4->_pSysStream[0][ii].x,
-                             gpu->psPosq4->_pSysStream[0][ii].y,
-                             gpu->psPosq4->_pSysStream[0][ii].z);
-
+                             gpu->psPosq4->_pSysData[ii].x,
+                             gpu->psPosq4->_pSysData[ii].y,
+                             gpu->psPosq4->_pSysData[ii].z);
+/*
             (void) fprintf( amoebaGpu->log,"   R[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+1],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+2],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+3],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+4],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+5],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+6],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+7],
-                             amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+8] );
-
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+1],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+2],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+3],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+4],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+5],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+6],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+7],
+                             amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+8] );
+*/
             // dipole

             (void) fprintf( amoebaGpu->log,"   D[%16.9e %16.9e %16.9e]\n",
-                             amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset],
-                             amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset+1],
-                             amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset+2] );
+                             amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset],
+                             amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+1],
+                             amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+2] );
    
             // quadrupole

             (void) fprintf( amoebaGpu->log,"   Q[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+1],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+2],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+3],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+4],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+5],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+6],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+7],
-                             amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+8] );
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+1],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+2],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+3],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+4],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+5],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+6],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+7],
+                             amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+8] );

            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint)) ){
                ii = gpu->natoms - maxPrint;
@@ -549,41 +549,30 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
  
 }

-//#define GET_INDUCED_DIPOLE_FROM_FILE
-#ifdef GET_INDUCED_DIPOLE_FROM_FILE
-#include <stdlib.h>
-#endif
-
 void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
 {
    std::string methodName = "kCalculateAmoebaMultipoleForces";
-    //printf("%s \n", methodName.c_str() ); fflush( stdout );

    // compute lab frame moments

    cudaComputeAmoebaLabFrameMoments( amoebaGpu );

+    if( 0 ){
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        //fileId.push_back( 0 );
+        VectorOfDoubleVectors outputVector;
+        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psLabFrameDipole,     outputVector, gpu->psAtomIndex->_pSysData );
+        cudaLoadCudaFloatArray( gpu->natoms,  9, amoebaGpu->psLabFrameQuadrupole, outputVector, gpu->psAtomIndex->_pSysData );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaLabMoments", fileId, outputVector );
+    }   
+
    // compute fixed E-field and mutual induced field 

    if( hasAmoebaGeneralizedKirkwood ){
        cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
-        if( 0 ){
-            gpuContext gpu = amoebaGpu->gpuContext;
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, 0.0 );
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, 0.0 );
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, 0.0 );
-        }
-
        cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
-        if( 0 ){
-            gpuContext gpu = amoebaGpu->gpuContext;
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, 0.0 );
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, 0.0 );
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipoleS, 0.0 );
-            initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolarS, 0.0 );
-            amoebaGpu->mutualInducedDone = 1;
-        }
-
    } else {
        if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
            cudaComputeAmoebaFixedEField( amoebaGpu );
@@ -595,75 +584,12 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
            //compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevData, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
-/*
-            if( 0 ){ 
-                gpu->psInteractionCount->Download();
-                gpu->psInteractingWorkUnit->Download();
-                gpu->psInteractionFlag->Download();
-                amoebaGpu->psWorkUnit->Download();
-                (void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
-                for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
-            
-                    unsigned int x          = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
-                    unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
-                    //unsigned int y          = ((x >> 2) & 0x7fff);
-                    unsigned int exclusions = (x & 0x1);
-                                 x          = (x >> 17) << GRIDBITS;
-                    //             x          = (x >> 17);
-                    (void) fprintf( amoebaGpu->log, "GpuCell %8u  %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
-            
-                                 x          = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
-                                 y          = ((x >> 2) & 0x7fff) << GRIDBITS;
-                                 exclusions = (x & 0x1);
-                                 x          = (x >> 17) << GRIDBITS;
-                    (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
-                }    
-            }
-*/
            cudaComputeAmoebaPmeFixedEField( amoebaGpu );
            cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
-
-#ifdef GET_INDUCED_DIPOLE_FROM_FILE
-            if( 0 ){
-                //std::string fileName = "waterInducedDipole.txt";
-                std::string fileName = "water_3_MI.txt";
-                StringVectorVector fileContents;
-                readFile( fileName, fileContents );
-                unsigned int offset  = 0; 
-                (void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
-                for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
-            
-                    StringVector lineTokens     = fileContents[ii];
-                    unsigned int lineTokenIndex = 1; 
-            
-                    (void) fprintf( amoebaGpu->log, "   %u %s [%s %s %s] [%15.7e %15.7e %15.7e]\n", ii, lineTokens[0].c_str(),
-                                    lineTokens[lineTokenIndex].c_str(), lineTokens[lineTokenIndex+1].c_str(), lineTokens[lineTokenIndex+2].c_str(),
-                                    amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2]); 
-                    amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-                    amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-                    amoebaGpu->psInducedDipole->_pSysStream[0][offset++]       = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-                    offset                                                    -= 3;
-                    amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-                    amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-                    amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++]  = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-                }
-                (void) fflush( amoebaGpu->log );
-                float conversion = 0.1f;
-                for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
-                    amoebaGpu->psInducedDipole->_pSysStream[0][ii]       *= conversion;
-                    amoebaGpu->psInducedDipolePolar->_pSysStream[0][ii]  *= conversion;
-                }    
-                //amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
-                //SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
-                amoebaGpu->psInducedDipole->Upload();
-                amoebaGpu->psInducedDipolePolar->Upload();
-           }
-#endif
-
        }
    }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
@@ -66,7 +66,7 @@ void kClearFloat4_kernel( unsigned int bufferLength, float4* fieldToClear )

 void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear )
 {
-    kClearFloat4_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevStream[0] );
+    kClearFloat4_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevData );
    LAUNCHERROR("kClearFloat4");
 }

@@ -90,7 +90,7 @@ void kClearFloat_kernel( unsigned int bufferLength, float* fieldToClear )

 void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear )
 {
-    kClearFloat_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevStream[0] );
+    kClearFloat_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevData );
    LAUNCHERROR("kClearFloat");
 }

@@ -118,23 +118,23 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
 {

    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_3_1->_pDevData );
    LAUNCHERROR("kClearFields_3_1");

    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_3_2->_pDevData );
    LAUNCHERROR("kClearFields_3_2");

    if( numberToClear > 2 ){
        kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_3_3->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_3_3->_pDevData );
        LAUNCHERROR("kClearFields_3_3");
    } else {
        return;
    }
    if( numberToClear > 3 ){
        kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
-                                                                amoebaGpu->psWorkArray_3_4->_pDevStream[0] );
+                                                                amoebaGpu->psWorkArray_3_4->_pDevData );
        LAUNCHERROR("kClearFields_3_4");
    }
 }
@@ -145,11 +145,11 @@ void kClearFields_1( amoebaGpuContext amoebaGpu )
 {

    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_1_1->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_1_1->_pDevData );
    LAUNCHERROR("kClearFields_1_1");

    kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
-                                                            amoebaGpu->psWorkArray_1_2->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_1_2->_pDevData );
    LAUNCHERROR("kClearFields_1_2");
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -271,7 +271,7 @@ static void kCalculateAmoebaVdw14_7Reduction(amoebaGpuContext amoebaGpu, CUDAStr
    else
        threadsPerBlock = G8X_NONBOND_THREADS_PER_BLOCK;
    kCalculateAmoebaVdw14_7Reduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, threadsPerBlock>>>(
-                               vdwOutputArray->_pDevStream[0], forceOutputArray->_pDevStream[0] );
+                               vdwOutputArray->_pDevData, forceOutputArray->_pDevData );
    LAUNCHERROR("kCalculateAmoebaVdw14_7Reduction");
 }

@@ -369,7 +369,7 @@ static void kCalculateAmoebaVdw14_7CoordinateReduction(amoebaGpuContext amoebaGp
    else
        threadsPerBlock = G8X_THREADS_PER_BLOCK;
    kCalculateAmoebaVdw14_7CoordinateReduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, threadsPerBlock>>>(
-                               coordinateArray->_pDevStream[0], reducedCoordinateArray->_pDevStream[0] );
+                               coordinateArray->_pDevData, reducedCoordinateArray->_pDevData );
    LAUNCHERROR("kCalculateAmoebaVdw14_7CoordinateReduction");
 }

@@ -404,7 +404,7 @@ void kCalculateAmoebaVdw14_7NonReduction_kernel( float* inputForce, float4* outp
 static void kCalculateAmoebaVdw14_7NonReduction(amoebaGpuContext amoebaGpu, CUDAStream<float>* vdwOutputArray, CUDAStream<float4>* forceOutputArray )
 {
    kCalculateAmoebaVdw14_7NonReduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>(
-                               vdwOutputArray->_pDevStream[0], forceOutputArray->_pDevStream[0] );
+                               vdwOutputArray->_pDevData, forceOutputArray->_pDevData );
    LAUNCHERROR("kCalculateAmoebaVdw14_7MonReduction");
 }

@@ -439,7 +439,7 @@ static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* output
 {
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                           amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                           amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
+                           amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceVdw14_7");
 }

@@ -467,7 +467,7 @@ void kCalculateAmoebaVdw14_7CopyCoordinates_kernel( unsigned int bufferLength, f
 void kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpuContext amoebaGpu, CUDAStream<float4>* toCopy, CUDAStream<float4>* copy )
 {
    kCalculateAmoebaVdw14_7CopyCoordinates_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( amoebaGpu->gpuContext->sim.paddedNumberOfAtoms, 
-                      toCopy->_pDevStream[0], copy->_pDevStream[0] );
+                      toCopy->_pDevData, copy->_pDevData );
    LAUNCHERROR("kCalculateAmoebaVdw14_7CopyCoordinates");
 }

@@ -499,7 +499,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
 #ifdef AMOEBA_DEBUG
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    int targetAtom                             = 342;
 #endif
@@ -534,10 +534,10 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
        amoebaGpu->psVdwWorkUnit->Download();
        unsigned int totalWarps  = (amoebaGpu->nonbondBlocks*threadsPerBlock)/GRID;
        float        ratiof     = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
-        (void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysStream[0][0] );
+        (void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
        for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
    
-            unsigned int x          = amoebaGpu->psVdwWorkUnit->_pSysStream[0][ii];
+            unsigned int x          = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
            unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
            unsigned int exclusions = (x & 0x1);
                         x          = (x >> 17) << GRIDBITS;
@@ -570,20 +570,20 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
            gpu->psInteractingWorkUnit->Download();
            gpu->psInteractionFlag->Download();
            amoebaGpu->psVdwWorkUnit->Download();
-            (void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
+            (void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
            for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
        
-                unsigned int x          = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
+                unsigned int x          = gpu->psInteractingWorkUnit->_pSysData[ii];
                unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
                unsigned int exclusions = (x & 0x1);
                             x          = (x >> 17) << GRIDBITS;
-                (void) fprintf( amoebaGpu->log, "GpuCell %8u  %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
+                (void) fprintf( amoebaGpu->log, "GpuCell %8u  %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysData[ii], x,y,exclusions, gpu->psInteractionFlag->_pSysData[ii] );
        
-                             x          = amoebaGpu->psVdwWorkUnit->_pSysStream[0][ii];
+                             x          = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
                             y          = ((x >> 2) & 0x7fff) << GRIDBITS;
                             exclusions = (x & 0x1);
                             x          = (x >> 17) << GRIDBITS;
-                (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
+                (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
            }    
                (void) fflush( amoebaGpu->log );
        }
@@ -591,29 +591,29 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
        if (gpu->bOutputBufferPerWarp){
            kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                gpu->sim.pInteractingWorkUnit,
-                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
-                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
+                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
 #ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                debugArray->_pDevStream[0], targetAtom );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                                debugArray->_pDevData, targetAtom );
 #else
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
        } else {

            kCalculateAmoebaVdw14_7Cutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                gpu->sim.pInteractingWorkUnit,
-                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
-                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
+                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
 #ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                debugArray->_pDevStream[0], targetAtom );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                                debugArray->_pDevData, targetAtom );
 #else
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
    
        }
@@ -624,30 +624,30 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
        if (gpu->bOutputBufferPerWarp){

            kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
-                                                                amoebaGpu->psVdwWorkUnit->_pDevStream[0],
-                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
-                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
+                                                                amoebaGpu->psVdwWorkUnit->_pDevData,
+                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
 #ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                debugArray->_pDevStream[0], targetAtom );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                                debugArray->_pDevData, targetAtom );
 #else
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
        } else {

            kCalculateAmoebaVdw14_7N2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
-                                                                amoebaGpu->psVdwWorkUnit->_pDevStream[0],
-                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
-                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
+                                                                amoebaGpu->psVdwWorkUnit->_pDevData,
+                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
 #ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                                debugArray->_pDevStream[0], targetAtom );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                                debugArray->_pDevData, targetAtom );
 #else
-                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif
    
        }
@@ -669,11 +669,11 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
            (void) fprintf( amoebaGpu->log,"%5d %5d DebugVdw\n", targetAtom, jj );
            for( int kk = 0; kk < 5; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
-                if( kk == 4 && ( fabs(  debugArray->_pSysStream[0][debugIndex].x ) > cutOff ||
-                                 fabs(  debugArray->_pSysStream[0][debugIndex].y ) > cutOff ||
-                                 fabs(  debugArray->_pSysStream[0][debugIndex].z ) > cutOff ) ){
+                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
+                if( kk == 4 && ( fabs(  debugArray->_pSysData[debugIndex].x ) > cutOff ||
+                                 fabs(  debugArray->_pSysData[debugIndex].y ) > cutOff ||
+                                 fabs(  debugArray->_pSysData[debugIndex].z ) > cutOff ) ){
                    (void) fprintf( amoebaGpu->log," XXXX\n" );
                }
                debugIndex += paddedNumberOfAtoms;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
@@ -32,7 +32,7 @@ void GetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext amoebaGpu)
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    

    fprintf( stderr, "In GetCalculateAmoebaCudaWcaDispersionSim: %p %lu %u\n", 
-             gpu->psInteractionCount->_pSysStream[0], gpu->psInteractionCount->_pSysStream[0][0], gpu->sim.workUnits );
+             gpu->psInteractionCount->_pSysData, gpu->psInteractionCount->_pSysData[0], gpu->sim.workUnits );

    RTERROR(status, "GetCalculateAmoebaCudaWcaDispersionSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
@@ -363,7 +363,7 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
 {
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
+                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceWcaDispersion");
 }

@@ -373,7 +373,7 @@ static void kReduceWcaDispersionToFloat4(amoebaGpuContext amoebaGpu, CUDAStream<
 {
    kReduceFieldsToFloat4_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                                   amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
-                                   amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
+                                   amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
    LAUNCHERROR("kReduceWcaDispersion");
 }

@@ -400,7 +400,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
    static const char* methodName = "kCalculateAmoebaWcaDispersionForces";
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    int targetAtom                             = 3;
 #endif
@@ -420,14 +420,14 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )

    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
-                                                            amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                            gpu->psPosq4->_pDevStream[0],
-                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevStream[0],
+                                                            amoebaGpu->psWorkUnit->_pDevData,
+                                                            gpu->psPosq4->_pDevData,
+                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                            amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                            debugArray->_pDevStream[0], targetAtom );
+                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                            debugArray->_pDevData, targetAtom );
 #else
-                                                            amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif

    } else {
@@ -442,14 +442,14 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
 #endif

        kCalculateAmoebaWcaDispersionN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
-                                                            amoebaGpu->psWorkUnit->_pDevStream[0],
-                                                            gpu->psPosq4->_pDevStream[0],
-                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevStream[0],
+                                                            amoebaGpu->psWorkUnit->_pDevData,
+                                                            gpu->psPosq4->_pDevData,
+                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
 #ifdef AMOEBA_DEBUG
-                                                            amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-                                                            debugArray->_pDevStream[0], targetAtom );
+                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
+                                                            debugArray->_pDevData, targetAtom );
 #else
-                                                            amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
+                                                            amoebaGpu->psWorkArray_3_1->_pDevData );
 #endif

    }
@@ -481,80 +481,80 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )

            for( int kk = 0; kk < -3; kk++ ){
                if( kk == 1 ){
-                    block =  static_cast<int>(debugArray->_pSysStream[0][debugIndex].w + 1.0e-04); 
+                    block =  static_cast<int>(debugArray->_pSysData[debugIndex].w + 1.0e-04); 
                    if( buffers.find(block) == buffers.end() ){
                        buffers[block] = 0.0;
                    }
                }
                if( kk == 1  && jj != targetAtom ){
-                    sums[0] += debugArray->_pSysStream[0][debugIndex].y;
-                    sums[1] += debugArray->_pSysStream[0][debugIndex].z;
-                    sums[2] += debugArray->_pSysStream[0][debugIndex].w;
-                    double x4 = debugArray->_pSysStream[0][debugIndex].x - (debugArray->_pSysStream[0][debugIndex].y + debugArray->_pSysStream[0][debugIndex].z + debugArray->_pSysStream[0][debugIndex].w);
+                    sums[0] += debugArray->_pSysData[debugIndex].y;
+                    sums[1] += debugArray->_pSysData[debugIndex].z;
+                    sums[2] += debugArray->_pSysData[debugIndex].w;
+                    double x4 = debugArray->_pSysData[debugIndex].x - (debugArray->_pSysData[debugIndex].y + debugArray->_pSysData[debugIndex].z + debugArray->_pSysData[debugIndex].w);
                    sums[3] += x4;
-                    //sum     += debugArray->_pSysStream[0][debugIndex].x;
-                    sum     += debugArray->_pSysStream[0][debugIndex].z;
-                    buffers[block] += debugArray->_pSysStream[0][debugIndex].z; 
+                    //sum     += debugArray->_pSysData[debugIndex].x;
+                    sum     += debugArray->_pSysData[debugIndex].z;
+                    buffers[block] += debugArray->_pSysData[debugIndex].z; 
                    (void) fprintf( amoebaGpu->log," %16.9e [%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, x4);
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, x4);
         
                } else if( kk == 2 &&  jj != targetAtom){
-                    //sum     += debugArray->_pSysStream[0][debugIndex].x;
-                    sum     += debugArray->_pSysStream[0][debugIndex].z;
-                    sums[4] += debugArray->_pSysStream[0][debugIndex].y;
-                    sums[5] += debugArray->_pSysStream[0][debugIndex].z;
-                    sums[6] += debugArray->_pSysStream[0][debugIndex].w;
-                    double x4 = debugArray->_pSysStream[0][debugIndex].x - (debugArray->_pSysStream[0][debugIndex].y + debugArray->_pSysStream[0][debugIndex].z + debugArray->_pSysStream[0][debugIndex].w);
+                    //sum     += debugArray->_pSysData[debugIndex].x;
+                    sum     += debugArray->_pSysData[debugIndex].z;
+                    sums[4] += debugArray->_pSysData[debugIndex].y;
+                    sums[5] += debugArray->_pSysData[debugIndex].z;
+                    sums[6] += debugArray->_pSysData[debugIndex].w;
+                    double x4 = debugArray->_pSysData[debugIndex].x - (debugArray->_pSysData[debugIndex].y + debugArray->_pSysData[debugIndex].z + debugArray->_pSysData[debugIndex].w);
                    sums[7] += x4;
-                    buffers[block] += debugArray->_pSysStream[0][debugIndex].z; 
+                    buffers[block] += debugArray->_pSysData[debugIndex].z; 
                    (void) fprintf( amoebaGpu->log," %16.9e [%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, x4);
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, x4);
                } else {
                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e] %7u\n",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, debugIndex );
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, debugIndex );
                }
                if( kk == 5 )(void) fprintf( amoebaGpu->log,"\n" );
                debugIndex += paddedNumberOfAtoms;
            }

-            block =  static_cast<int>(debugArray->_pSysStream[0][debugIndex+paddedNumberOfAtoms].w + 1.0e-04); 
+            block =  static_cast<int>(debugArray->_pSysData[debugIndex+paddedNumberOfAtoms].w + 1.0e-04); 
            if( buffers.find(block) == buffers.end() ){
                buffers[block] = 0.0;
                maxD[block]    = 0.0;
            }
            for( int kk = 0; kk < 3; kk++ ){
                if( kk == 0 && jj != targetAtom ){
-                    sum            += debugArray->_pSysStream[0][debugIndex].z;
-                    buffers[block] += debugArray->_pSysStream[0][debugIndex].z; 
+                    sum            += debugArray->_pSysData[debugIndex].z;
+                    buffers[block] += debugArray->_pSysData[debugIndex].z; 
                    trackI[block].push_back( jj );
-                    trackD[block].push_back( debugArray->_pSysStream[0][debugIndex].z );
-                    trackT[block].push_back( debugArray->_pSysStream[0][debugIndex].w );
-                    if( fabs( debugArray->_pSysStream[0][debugIndex].w ) > maxD[block] ){
-                        maxD[block] = fabs( debugArray->_pSysStream[0][debugIndex].w );
+                    trackD[block].push_back( debugArray->_pSysData[debugIndex].z );
+                    trackT[block].push_back( debugArray->_pSysData[debugIndex].w );
+                    if( fabs( debugArray->_pSysData[debugIndex].w ) > maxD[block] ){
+                        maxD[block] = fabs( debugArray->_pSysData[debugIndex].w );
                    }
                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w);
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w);
         
                } else if( kk == 2 &&  jj != targetAtom){
-                    sum             += debugArray->_pSysStream[0][debugIndex].z;
-                    buffers[block]  += debugArray->_pSysStream[0][debugIndex].z; 
+                    sum             += debugArray->_pSysData[debugIndex].z;
+                    buffers[block]  += debugArray->_pSysData[debugIndex].z; 
                    trackI[block].push_back( jj );
-                    trackD[block].push_back( debugArray->_pSysStream[0][debugIndex].z );
-                    trackT[block].push_back( debugArray->_pSysStream[0][debugIndex].w );
-                    if( fabs( debugArray->_pSysStream[0][debugIndex].w ) > maxD[block] ){
-                        maxD[block] = fabs( debugArray->_pSysStream[0][debugIndex].w );
+                    trackD[block].push_back( debugArray->_pSysData[debugIndex].z );
+                    trackT[block].push_back( debugArray->_pSysData[debugIndex].w );
+                    if( fabs( debugArray->_pSysData[debugIndex].w ) > maxD[block] ){
+                        maxD[block] = fabs( debugArray->_pSysData[debugIndex].w );
                    }
                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w);
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w);
                } else {
                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e] %7u\n",
-                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
-                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, debugIndex );
+                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, debugIndex );
                }
                if( kk == 5 )(void) fprintf( amoebaGpu->log,"\n" );
                debugIndex += paddedNumberOfAtoms;
@@ -600,11 +600,11 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
            int debugIndex9  = debugIndex + 9*paddedNumberOfAtoms;
            int debugIndex10 = debugIndex + 10*paddedNumberOfAtoms;
            (void) fprintf( amoebaGpu->log,"%6d %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e \n", jj,
-                                       debugArray->_pSysStream[0][debugIndex4].x*debugArray->_pSysStream[0][debugIndex4].x,  // r2
-                                       debugArray->_pSysStream[0][debugIndex4].y, debugArray->_pSysStream[0][debugIndex9].y, // de
-                                       debugArray->_pSysStream[0][debugIndex5].x, debugArray->_pSysStream[0][debugIndex10].x, // dll
-                                       debugArray->_pSysStream[0][debugIndex5].y, debugArray->_pSysStream[0][debugIndex10].y, // duu
-                                       debugArray->_pSysStream[0][debugIndex4].z, debugArray->_pSysStream[0][debugIndex9].z ); // emxio
+                                       debugArray->_pSysData[debugIndex4].x*debugArray->_pSysData[debugIndex4].x,  // r2
+                                       debugArray->_pSysData[debugIndex4].y, debugArray->_pSysData[debugIndex9].y, // de
+                                       debugArray->_pSysData[debugIndex5].x, debugArray->_pSysData[debugIndex10].x, // dll
+                                       debugArray->_pSysData[debugIndex5].y, debugArray->_pSysData[debugIndex10].y, // duu
+                                       debugArray->_pSysData[debugIndex4].z, debugArray->_pSysData[debugIndex9].z ); // emxio
        }
 */
    }
@@ -616,7 +616,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
        gpu->psEnergy->Download();
        double sum = 0.0;
        for (int i = 0; i < gpu->sim.energyOutputBuffers; i++){
-            sum  += gpu->psEnergy->_pSysStream[0][i];
+            sum  += gpu->psEnergy->_pSysData[i];
            if( fabsf( (*gpu->psEnergy)[i]) > 0.0 )
                (void) fprintf( amoebaGpu->log,"SumQQ %6d %14.7e QQ SUM\n", i, (*gpu->psEnergy)[i] );
        }