Enhancements to CUDAStream to reduce the risk of bugs and make debugging easier

306d99e8 · Peter Eastman · 968cb132
Commit 306d99e8 authored Apr 08, 2009 by Peter Eastman
3 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -76,40 +76,42 @@ struct CUDAStream : public SoADeviceObject
    T**             _pDevStream;
    T*              _pSysData;
    T*              _pDevData;
-    CUDAStream(int length, int subStreams = 1);
-    CUDAStream(unsigned int length, unsigned int subStreams = 1);
-    CUDAStream(unsigned int length, int subStreams = 1);
-    CUDAStream(int length, unsigned int subStreams = 1);
+    std::string     _name;
+    CUDAStream(int length, int subStreams = 1, std::string name="");
+    CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
+    CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
+    CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
    virtual ~CUDAStream();
    void Allocate();
    void Deallocate();
    void Upload();
    void Download();
    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
+    T& operator[](int index);
 };

 float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);

 template <typename T>
-CUDAStream<T>::CUDAStream(int length, unsigned int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0)
+CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
 {
    Allocate();   
 }

 template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0)
+CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
 {
    Allocate();   
 }

 template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0)
+CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
 {
    Allocate();   
 }

 template <typename T>
-CUDAStream<T>::CUDAStream(int length, int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0)
+CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
 {
    Allocate();   
 }
@@ -129,7 +131,7 @@ void CUDAStream<T>::Allocate()
    _pSysData =     new T[_subStreams * _stride];

    status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
-    RTERROR(status, "cudaMalloc CUDAStream::Allocate failed");
+    RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());

    for (unsigned int i = 0; i < _subStreams; i++)
    {
@@ -149,7 +151,7 @@ void CUDAStream<T>::Deallocate()
    delete[] _pSysData;
    _pSysData = NULL;
    status = cudaFree(_pDevData);
-    RTERROR(status, "cudaFree CUDAStream::Deallocate failed");    
+    RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
 }

 template <typename T>
@@ -157,7 +159,7 @@ void CUDAStream<T>::Upload()
 {
    cudaError_t status;
    status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
-    RTERROR(status, "cudaMemcpy CUDAStream::Upload failed");
+    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
 }

 template <typename T>
@@ -165,7 +167,7 @@ void CUDAStream<T>::Download()
 {
    cudaError_t status;
    status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
-    RTERROR(status, "cudaMemcpy CUDAStream::Download failed");
+    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
 }

 template <typename T>
@@ -210,6 +212,12 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
    delete[] pTemp;
 }

+template <typename T>
+T& CUDAStream<T>::operator[](int index)
+{
+    return _pSysData[index];
+}
+
 static const unsigned int GRID = 32;
 static const unsigned int GRIDBITS = 5;
 static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -128,29 +128,29 @@ void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector
 {
    int bonds = atom1.size();
    gpu->sim.bonds                              = bonds;
-    CUDAStream<int4>* psBondID                  = new CUDAStream<int4>(bonds, 1);
+    CUDAStream<int4>* psBondID                  = new CUDAStream<int4>(bonds, 1, "BondID");
    gpu->psBondID                               = psBondID;
    gpu->sim.pBondID                            = psBondID->_pDevStream[0];
-    CUDAStream<float2>* psBondParameter         = new CUDAStream<float2>(bonds, 1);
+    CUDAStream<float2>* psBondParameter         = new CUDAStream<float2>(bonds, 1, "BondParameter");
    gpu->psBondParameter                        = psBondParameter;
    gpu->sim.pBondParameter                     = psBondParameter->_pDevStream[0];
    for (int i = 0; i < bonds; i++)
    {
-        psBondID->_pSysStream[0][i].x = atom1[i];
-        psBondID->_pSysStream[0][i].y = atom2[i];
-        psBondParameter->_pSysStream[0][i].x = length[i];
-        psBondParameter->_pSysStream[0][i].y = k[i];
-        psBondID->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psBondID->_pSysStream[0][i].x]++;
-        psBondID->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psBondID->_pSysStream[0][i].y]++;
+        (*psBondID)[i].x = atom1[i];
+        (*psBondID)[i].y = atom2[i];
+        (*psBondParameter)[i].x = length[i];
+        (*psBondParameter)[i].y = k[i];
+        psBondID->_pSysData[i].z = gpu->pOutputBufferCounter[psBondID->_pSysData[i].x]++;
+        psBondID->_pSysData[i].w = gpu->pOutputBufferCounter[psBondID->_pSysData[i].y]++;
 #if (DUMP_PARAMETERS == 1)                
        cout << 
            i << " " << 
-            psBondID->_pSysStream[0][i].x << " " << 
-            psBondID->_pSysStream[0][i].y << " " << 
-            psBondID->_pSysStream[0][i].z << " " << 
-            psBondID->_pSysStream[0][i].w << " " << 
-            psBondParameter->_pSysStream[0][i].x << " " << 
-            psBondParameter->_pSysStream[0][i].y << 
+            (*psBondID)[i].x << " " <<
+            (*psBondID)[i].y << " " <<
+            (*psBondID)[i].z << " " <<
+            (*psBondID)[i].w << " " <<
+            (*psBondParameter)[i].x << " " <<
+            (*psBondParameter)[i].y <<
            endl;
 #endif
    }
@@ -164,37 +164,37 @@ void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const v
 {
    int bond_angles = atom1.size();
    gpu->sim.bond_angles                        = bond_angles;
-    CUDAStream<int4>* psBondAngleID1            = new CUDAStream<int4>(bond_angles, 1);
+    CUDAStream<int4>* psBondAngleID1            = new CUDAStream<int4>(bond_angles, 1, "BondAngleID1");
    gpu->psBondAngleID1                         = psBondAngleID1;
    gpu->sim.pBondAngleID1                      = psBondAngleID1->_pDevStream[0];
-    CUDAStream<int2>* psBondAngleID2            = new CUDAStream<int2>(bond_angles, 1);
+    CUDAStream<int2>* psBondAngleID2            = new CUDAStream<int2>(bond_angles, 1, "BondAngleID2");
    gpu->psBondAngleID2                         = psBondAngleID2;
    gpu->sim.pBondAngleID2                      = psBondAngleID2->_pDevStream[0];
-    CUDAStream<float2>* psBondAngleParameter    = new CUDAStream<float2>(bond_angles, 1);
+    CUDAStream<float2>* psBondAngleParameter    = new CUDAStream<float2>(bond_angles, 1, "BondAngleParameter");
    gpu->psBondAngleParameter                   = psBondAngleParameter;
    gpu->sim.pBondAngleParameter                = psBondAngleParameter->_pDevStream[0];        

    for (int i = 0; i < bond_angles; i++)
    {
-        psBondAngleID1->_pSysStream[0][i].x = atom1[i];
-        psBondAngleID1->_pSysStream[0][i].y = atom2[i];
-        psBondAngleID1->_pSysStream[0][i].z = atom3[i];
-        psBondAngleParameter->_pSysStream[0][i].x = angle[i];
-        psBondAngleParameter->_pSysStream[0][i].y = k[i];
-        psBondAngleID1->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psBondAngleID1->_pSysStream[0][i].x]++;
-        psBondAngleID2->_pSysStream[0][i].x = gpu->pOutputBufferCounter[psBondAngleID1->_pSysStream[0][i].y]++;
-        psBondAngleID2->_pSysStream[0][i].y = gpu->pOutputBufferCounter[psBondAngleID1->_pSysStream[0][i].z]++;
+        (*psBondAngleID1)[i].x = atom1[i];
+        (*psBondAngleID1)[i].y = atom2[i];
+        (*psBondAngleID1)[i].z = atom3[i];
+        (*psBondAngleParameter)[i].x = angle[i];
+        (*psBondAngleParameter)[i].y = k[i];
+        psBondAngleID1->_pSysData[i].w = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].x]++;
+        psBondAngleID2->_pSysData[i].x = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].y]++;
+        psBondAngleID2->_pSysData[i].y = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].z]++;
 #if (DUMP_PARAMETERS == 1)
         cout << 
            i << " " << 
-            psBondAngleID1->_pSysStream[0][i].x << " " << 
-            psBondAngleID1->_pSysStream[0][i].y << " " << 
-            psBondAngleID1->_pSysStream[0][i].z << " " << 
-            psBondAngleID1->_pSysStream[0][i].w << " " << 
-            psBondAngleID2->_pSysStream[0][i].x << " " << 
-            psBondAngleID2->_pSysStream[0][i].y << " " << 
-            psBondAngleParameter->_pSysStream[0][i].x << " " << 
-            psBondAngleParameter->_pSysStream[0][i].y << 
+            (*psBondAngleID1)[i].x << " " <<
+            (*psBondAngleID1)[i].y << " " <<
+            (*psBondAngleID1)[i].z << " " <<
+            (*psBondAngleID1)[i].w << " " <<
+            (*psBondAngleID2)[i].x << " " <<
+            (*psBondAngleID2)[i].y << " " <<
+            (*psBondAngleParameter)[i].x << " " <<
+            (*psBondAngleParameter)[i].y <<
            endl;
 #endif
    }
@@ -209,42 +209,42 @@ void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const ve
 {
        int dihedrals = atom1.size();
        gpu->sim.dihedrals = dihedrals;
-        CUDAStream<int4>* psDihedralID1             = new CUDAStream<int4>(dihedrals, 1);
+        CUDAStream<int4>* psDihedralID1             = new CUDAStream<int4>(dihedrals, 1, "DihedralID1");
        gpu->psDihedralID1                          = psDihedralID1;
        gpu->sim.pDihedralID1                       = psDihedralID1->_pDevStream[0];
-        CUDAStream<int4>* psDihedralID2             = new CUDAStream<int4>(dihedrals, 1);
+        CUDAStream<int4>* psDihedralID2             = new CUDAStream<int4>(dihedrals, 1, "DihedralID2");
        gpu->psDihedralID2                          = psDihedralID2;
        gpu->sim.pDihedralID2                       = psDihedralID2->_pDevStream[0];
-        CUDAStream<float4>* psDihedralParameter     = new CUDAStream<float4>(dihedrals, 1);
+        CUDAStream<float4>* psDihedralParameter     = new CUDAStream<float4>(dihedrals, 1, "DihedralParameter");
        gpu->psDihedralParameter                    = psDihedralParameter;
        gpu->sim.pDihedralParameter                 = psDihedralParameter->_pDevStream[0];
        for (int i = 0; i < dihedrals; i++)
        {
-            psDihedralID1->_pSysStream[0][i].x = atom1[i];
-            psDihedralID1->_pSysStream[0][i].y = atom2[i];
-            psDihedralID1->_pSysStream[0][i].z = atom3[i];
-            psDihedralID1->_pSysStream[0][i].w = atom4[i];
-            psDihedralParameter->_pSysStream[0][i].x = k[i];
-            psDihedralParameter->_pSysStream[0][i].y = phase[i];
-            psDihedralParameter->_pSysStream[0][i].z = (float) periodicity[i];
-            psDihedralID2->_pSysStream[0][i].x = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].x]++;
-            psDihedralID2->_pSysStream[0][i].y = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].y]++;
-            psDihedralID2->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].z]++;
-            psDihedralID2->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].w]++;
+            (*psDihedralID1)[i].x = atom1[i];
+            (*psDihedralID1)[i].y = atom2[i];
+            (*psDihedralID1)[i].z = atom3[i];
+            (*psDihedralID1)[i].w = atom4[i];
+            (*psDihedralParameter)[i].x = k[i];
+            (*psDihedralParameter)[i].y = phase[i];
+            (*psDihedralParameter)[i].z = (float) periodicity[i];
+            psDihedralID2->_pSysData[i].x = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].x]++;
+            psDihedralID2->_pSysData[i].y = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].y]++;
+            psDihedralID2->_pSysData[i].z = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].z]++;
+            psDihedralID2->_pSysData[i].w = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].w]++;
 #if (DUMP_PARAMETERS == 1)
            cout << 
                i << " " << 
-                psDihedralID1->_pSysStream[0][i].x << " " << 
-                psDihedralID1->_pSysStream[0][i].y << " " << 
-                psDihedralID1->_pSysStream[0][i].z << " " << 
-                psDihedralID1->_pSysStream[0][i].w << " " << 
-                psDihedralID2->_pSysStream[0][i].x << " " << 
-                psDihedralID2->_pSysStream[0][i].y << " " << 
-                psDihedralID2->_pSysStream[0][i].z << " " << 
-                psDihedralID2->_pSysStream[0][i].w << " " << 
-                psDihedralParameter->_pSysStream[0][i].x << " " << 
-                psDihedralParameter->_pSysStream[0][i].y << " " << 
-                psDihedralParameter->_pSysStream[0][i].z << endl;
+                (*psDihedralID1)[i].x << " " <<
+                (*psDihedralID1)[i].y << " " <<
+                (*psDihedralID1)[i].z << " " <<
+                (*psDihedralID1)[i].w << " " <<
+                (*psDihedralID2)[i].x << " " <<
+                (*psDihedralID2)[i].y << " " <<
+                (*psDihedralID2)[i].z << " " <<
+                (*psDihedralID2)[i].w << " " <<
+                (*psDihedralParameter)[i].x << " " <<
+                (*psDihedralParameter)[i].y << " " <<
+                (*psDihedralParameter)[i].z << endl;
 #endif
        }
        psDihedralID1->Upload();
@@ -258,52 +258,52 @@ void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const
 {
    int rb_dihedrals = atom1.size();
    gpu->sim.rb_dihedrals = rb_dihedrals;
-    CUDAStream<int4>* psRbDihedralID1           = new CUDAStream<int4>(rb_dihedrals, 1);
+    CUDAStream<int4>* psRbDihedralID1           = new CUDAStream<int4>(rb_dihedrals, 1, "RbDihedralID1");
    gpu->psRbDihedralID1                        = psRbDihedralID1;
    gpu->sim.pRbDihedralID1                     = psRbDihedralID1->_pDevStream[0];
-    CUDAStream<int4>* psRbDihedralID2           = new CUDAStream<int4>(rb_dihedrals, 1);
+    CUDAStream<int4>* psRbDihedralID2           = new CUDAStream<int4>(rb_dihedrals, 1, "RbDihedralID2");
    gpu->psRbDihedralID2                        = psRbDihedralID2;
    gpu->sim.pRbDihedralID2                     = psRbDihedralID2->_pDevStream[0];
-    CUDAStream<float4>* psRbDihedralParameter1  = new CUDAStream<float4>(rb_dihedrals, 1);
+    CUDAStream<float4>* psRbDihedralParameter1  = new CUDAStream<float4>(rb_dihedrals, 1, "RbDihedralParameter1");
    gpu->psRbDihedralParameter1                 = psRbDihedralParameter1;
    gpu->sim.pRbDihedralParameter1              = psRbDihedralParameter1->_pDevStream[0];
-    CUDAStream<float2>* psRbDihedralParameter2  = new CUDAStream<float2>(rb_dihedrals, 1);    
+    CUDAStream<float2>* psRbDihedralParameter2  = new CUDAStream<float2>(rb_dihedrals, 1, "RbDihedralParameter2");
    gpu->psRbDihedralParameter2                 = psRbDihedralParameter2;
    gpu->sim.pRbDihedralParameter2              = psRbDihedralParameter2->_pDevStream[0];

    for (int i = 0; i < rb_dihedrals; i++)
    {
-        psRbDihedralID1->_pSysStream[0][i].x = atom1[i];
-        psRbDihedralID1->_pSysStream[0][i].y = atom2[i];
-        psRbDihedralID1->_pSysStream[0][i].z = atom3[i];
-        psRbDihedralID1->_pSysStream[0][i].w = atom4[i];
-        psRbDihedralParameter1->_pSysStream[0][i].x = c0[i];
-        psRbDihedralParameter1->_pSysStream[0][i].y = c1[i];
-        psRbDihedralParameter1->_pSysStream[0][i].z = c2[i];
-        psRbDihedralParameter1->_pSysStream[0][i].w = c3[i];
-        psRbDihedralParameter2->_pSysStream[0][i].x = c4[i];
-        psRbDihedralParameter2->_pSysStream[0][i].y = c5[i];
-        psRbDihedralID2->_pSysStream[0][i].x = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].x]++;
-        psRbDihedralID2->_pSysStream[0][i].y = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].y]++;
-        psRbDihedralID2->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].z]++;
-        psRbDihedralID2->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].w]++;
+        (*psRbDihedralID1)[i].x = atom1[i];
+        (*psRbDihedralID1)[i].y = atom2[i];
+        (*psRbDihedralID1)[i].z = atom3[i];
+        (*psRbDihedralID1)[i].w = atom4[i];
+        (*psRbDihedralParameter1)[i].x = c0[i];
+        (*psRbDihedralParameter1)[i].y = c1[i];
+        (*psRbDihedralParameter1)[i].z = c2[i];
+        (*psRbDihedralParameter1)[i].w = c3[i];
+        (*psRbDihedralParameter2)[i].x = c4[i];
+        (*psRbDihedralParameter2)[i].y = c5[i];
+        psRbDihedralID2->_pSysData[i].x = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].x]++;
+        psRbDihedralID2->_pSysData[i].y = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].y]++;
+        psRbDihedralID2->_pSysData[i].z = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].z]++;
+        psRbDihedralID2->_pSysData[i].w = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].w]++;
 #if (DUMP_PARAMETERS == 1)
        cout << 
            i << " " << 
-            psRbDihedralID1->_pSysStream[0][i].x << " " << 
-            psRbDihedralID1->_pSysStream[0][i].y << " " << 
-            psRbDihedralID1->_pSysStream[0][i].z << " " << 
-            psRbDihedralID1->_pSysStream[0][i].w <<" " << 
-            psRbDihedralID2->_pSysStream[0][i].x << " " << 
-            psRbDihedralID2->_pSysStream[0][i].y << " " << 
-            psRbDihedralID2->_pSysStream[0][i].z << " " << 
-            psRbDihedralID2->_pSysStream[0][i].w <<" " <<                 
-            psRbDihedralParameter1->_pSysStream[0][i].x << " " << 
-            psRbDihedralParameter1->_pSysStream[0][i].y << " " << 
-            psRbDihedralParameter1->_pSysStream[0][i].z << " " << 
-            psRbDihedralParameter1->_pSysStream[0][i].w << " " << 
-            psRbDihedralParameter2->_pSysStream[0][i].x << " " << 
-            psRbDihedralParameter2->_pSysStream[0][i].y << 
+            (*psRbDihedralID1)[i].x << " " <<
+            (*psRbDihedralID1)[i].y << " " <<
+            (*psRbDihedralID1)[i].z << " " <<
+            (*psRbDihedralID1)[i].w <<" " <<
+            (*psRbDihedralID2)[i].x << " " <<
+            (*psRbDihedralID2)[i].y << " " <<
+            (*psRbDihedralID2)[i].z << " " <<
+            (*psRbDihedralID2)[i].w <<" " <<
+            (*psRbDihedralParameter1)[i].x << " " <<
+            (*psRbDihedralParameter1)[i].y << " " <<
+            (*psRbDihedralParameter1)[i].z << " " <<
+            (*psRbDihedralParameter1)[i].w << " " <<
+            (*psRbDihedralParameter2)[i].x << " " <<
+            (*psRbDihedralParameter2)[i].y <<
            endl;
 #endif
    }
@@ -321,19 +321,19 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vecto
    float scale = epsfac * fudge;

    gpu->sim.LJ14s                              = LJ14s;
-    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1);
+    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1, "LJ14ID");
    gpu->psLJ14ID                               = psLJ14ID;
    gpu->sim.pLJ14ID                            = psLJ14ID->_pDevStream[0];
-    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1);
+    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1, "LJ14Parameter");
    gpu->psLJ14Parameter                        = psLJ14Parameter;
    gpu->sim.pLJ14Parameter                     = psLJ14Parameter->_pDevStream[0];

    for (int i = 0; i < LJ14s; i++)
    {
-        psLJ14ID->_pSysStream[0][i].x = atom1[i];
-        psLJ14ID->_pSysStream[0][i].y = atom2[i];
-        psLJ14ID->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].x]++;
-        psLJ14ID->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].y]++;
+        (*psLJ14ID)[i].x = atom1[i];
+        (*psLJ14ID)[i].y = atom2[i];
+        psLJ14ID->_pSysData[i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].x]++;
+        psLJ14ID->_pSysData[i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].y]++;
        float p0, p1, p2;
        if (c12[i] == 0.0f)
        {
@@ -346,20 +346,20 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vecto
            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
        }
        p2 = scale * q1[i] * q2[i];
-        psLJ14Parameter->_pSysStream[0][i].x = p0;
-        psLJ14Parameter->_pSysStream[0][i].y = p1;
-        psLJ14Parameter->_pSysStream[0][i].z = p2;
+        (*psLJ14Parameter)[i].x = p0;
+        (*psLJ14Parameter)[i].y = p1;
+        (*psLJ14Parameter)[i].z = p2;
    }
 #if (DUMP_PARAMETERS == 1)
        cout << 
            i << " " <<
-            psLJ14ID->_pSysStream[0][i].x << " " << 
-            psLJ14ID->_pSysStream[0][i].y << " " << 
-            psLJ14ID->_pSysStream[0][i].z << " " << 
-            psLJ14ID->_pSysStream[0][i].w << " " << 
-            psLJ14Parameter->_pSysStream[0][i].x << " " << 
-            psLJ14Parameter->_pSysStream[0][i].y << " " <<
-            psLJ14Parameter->_pSysStream[0][i].z << " " << 
+            (*psLJ14ID)[i].x << " " <<
+            (*psLJ14ID)[i].y << " " <<
+            (*psLJ14ID)[i].z << " " <<
+            (*psLJ14ID)[i].w << " " <<
+            (*psLJ14Parameter)[i].x << " " <<
+            (*psLJ14Parameter)[i].y << " " <<
+            (*psLJ14Parameter)[i].z << " " <<
            p0 << " " << 
            p1 << " " << 
            p2 << " " << 
@@ -389,20 +389,20 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
            }
            if (symbol.size() > 0)
                gpu->pAtomSymbol[i] = symbol[i];
-            gpu->psPosq4->_pSysStream[0][i].w = p0;
-            gpu->psSigEps2->_pSysStream[0][i].x = p1;
-            gpu->psSigEps2->_pSysStream[0][i].y = p2;
+            (*gpu->psPosq4)[i].w = p0;
+            (*gpu->psSigEps2)[i].x = p1;
+            (*gpu->psSigEps2)[i].y = p2;
    }

    // Dummy out extra atom data
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
-        gpu->psPosq4->_pSysStream[0][i].x       = 100000.0f + i * 10.0f;
-        gpu->psPosq4->_pSysStream[0][i].y       = 100000.0f + i * 10.0f;
-        gpu->psPosq4->_pSysStream[0][i].z       = 100000.0f + i * 10.0f;
-        gpu->psPosq4->_pSysStream[0][i].w       = 0.0f;
-        gpu->psSigEps2->_pSysStream[0][i].x     = 0.0f;
-        gpu->psSigEps2->_pSysStream[0][i].y     = 0.0f;   
+        (*gpu->psPosq4)[i].x       = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].y       = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].z       = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].w       = 0.0f;
+        (*gpu->psSigEps2)[i].x     = 0.0f;
+        (*gpu->psSigEps2)[i].y     = 0.0f;
    }

    gpu->psPosq4->Upload();
@@ -432,23 +432,23 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
    gpu->bIncludeGBSA = true;
    for (unsigned int i = 0; i < atoms; i++)
    {
-            gpu->psObcData->_pSysStream[0][i].x = radius[i] - dielectricOffset;
-            gpu->psObcData->_pSysStream[0][i].y = scale[i] * gpu->psObcData->_pSysStream[0][i].x;
+            (*gpu->psObcData)[i].x = radius[i] - dielectricOffset;
+            (*gpu->psObcData)[i].y = scale[i] * (*gpu->psObcData)[i].x;

 #if (DUMP_PARAMETERS == 1)
        cout << 
            i << " " << 
-            gpu->psObcData->_pSysStream[0][i].x << " " <<
-            gpu->psObcData->_pSysStream[0][i].y;
+            (*gpu->psObcData)[i].x << " " <<
+            (*gpu->psObcData)[i].y;
 #endif
    }

    // Dummy out extra atom data
    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
-        gpu->psBornRadii->_pSysStream[0][i]     = 0.2f;
-        gpu->psObcData->_pSysStream[0][i].x     = 0.01f;
-        gpu->psObcData->_pSysStream[0][i].y     = 0.01f;
+        (*gpu->psBornRadii)[i]     = 0.2f;
+        (*gpu->psObcData)[i].x     = 0.01f;
+        (*gpu->psObcData)[i].y     = 0.01f;
    }

    gpu->psBornRadii->Upload();
@@ -515,10 +515,10 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const

    // Record the actual SETTLE clusters.

-    CUDAStream<int4>* psSettleID          = new CUDAStream<int4>((int) settleClusters.size(), 1);
+    CUDAStream<int4>* psSettleID          = new CUDAStream<int4>((int) settleClusters.size(), 1, "SettleID");
    gpu->psSettleID                       = psSettleID;
    gpu->sim.pSettleID                    = psSettleID->_pDevStream[0];
-    CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1);
+    CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1, "SettleParameter");
    gpu->psSettleParameter                = psSettleParameter;
    gpu->sim.pSettleParameter             = psSettleParameter->_pDevStream[0];
    gpu->sim.settleConstraints            = settleClusters.size();
@@ -530,25 +530,25 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        float dist13 = settleConstraints[atom1].find(atom3)->second;
        float dist23 = settleConstraints[atom2].find(atom3)->second;
        if (dist12 == dist13) { // atom1 is the central atom
-            psSettleID->_pSysData[i].x = atom1;
-            psSettleID->_pSysData[i].y = atom2;
-            psSettleID->_pSysData[i].z = atom3;
-            psSettleParameter->_pSysData[i].x = dist12;
-            psSettleParameter->_pSysData[i].y = dist23;
+            (*psSettleID)[i].x = atom1;
+            (*psSettleID)[i].y = atom2;
+            (*psSettleID)[i].z = atom3;
+            (*psSettleParameter)[i].x = dist12;
+            (*psSettleParameter)[i].y = dist23;
        }
        else if (dist12 == dist23) { // atom2 is the central atom
-            psSettleID->_pSysData[i].x = atom2;
-            psSettleID->_pSysData[i].y = atom1;
-            psSettleID->_pSysData[i].z = atom3;
-            psSettleParameter->_pSysData[i].x = dist12;
-            psSettleParameter->_pSysData[i].y = dist13;
+            (*psSettleID)[i].x = atom2;
+            (*psSettleID)[i].y = atom1;
+            (*psSettleID)[i].z = atom3;
+            (*psSettleParameter)[i].x = dist12;
+            (*psSettleParameter)[i].y = dist13;
        }
        else if (dist13 == dist23) { // atom3 is the central atom
-            psSettleID->_pSysData[i].x = atom3;
-            psSettleID->_pSysData[i].y = atom1;
-            psSettleID->_pSysData[i].z = atom2;
-            psSettleParameter->_pSysData[i].x = dist13;
-            psSettleParameter->_pSysData[i].y = dist12;
+            (*psSettleID)[i].x = atom3;
+            (*psSettleID)[i].y = atom1;
+            (*psSettleID)[i].z = atom2;
+            (*psSettleParameter)[i].x = dist13;
+            (*psSettleParameter)[i].y = dist12;
        }
        else
            throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
@@ -627,10 +627,10 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const

    // Fill in the Cuda streams.

-    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>(validShakeClusters, 1);
+    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>(validShakeClusters, 1, "ShakeID");
    gpu->psShakeID                          = psShakeID;
    gpu->sim.pShakeID                       = psShakeID->_pDevStream[0];
-    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>(validShakeClusters, 1);
+    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>(validShakeClusters, 1, "ShakeParameter");
    gpu->psShakeParameter                   = psShakeParameter;
    gpu->sim.pShakeParameter                = psShakeParameter->_pDevStream[0];
    gpu->sim.ShakeConstraints               = validShakeClusters;
@@ -639,14 +639,14 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        const ShakeCluster& cluster = iter->second;
        if (!cluster.valid)
            continue;
-        psShakeID->_pSysStream[0][index].x = cluster.centralID;
-        psShakeID->_pSysStream[0][index].y = cluster.peripheralID[0];
-        psShakeID->_pSysStream[0][index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
-        psShakeID->_pSysStream[0][index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
-        psShakeParameter->_pSysStream[0][index].x = cluster.centralInvMass;
-        psShakeParameter->_pSysStream[0][index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
-        psShakeParameter->_pSysStream[0][index].z = cluster.distance*cluster.distance;
-        psShakeParameter->_pSysStream[0][index].w = cluster.peripheralInvMass;
+        (*psShakeID)[index].x = cluster.centralID;
+        (*psShakeID)[index].y = cluster.peripheralID[0];
+        (*psShakeID)[index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
+        (*psShakeID)[index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
+        (*psShakeParameter)[index].x = cluster.centralInvMass;
+        (*psShakeParameter)[index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
+        (*psShakeParameter)[index].z = cluster.distance*cluster.distance;
+        (*psShakeParameter)[index].w = cluster.peripheralInvMass;
        isShakeAtom[cluster.centralID] = true;
        isShakeAtom[cluster.peripheralID[0]] = true;
        if (cluster.size > 1)
@@ -691,64 +691,64 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const

    // Fill in the CUDA streams.

-    CUDAStream<int2>* psLincsAtoms = new CUDAStream<int2>((int) lincsConstraints.size(), 1);
+    CUDAStream<int2>* psLincsAtoms = new CUDAStream<int2>((int) lincsConstraints.size(), 1, "LincsAtoms");
    gpu->psLincsAtoms              = psLincsAtoms;
    gpu->sim.pLincsAtoms           = psLincsAtoms->_pDevData;
-    CUDAStream<float4>* psLincsDistance = new CUDAStream<float4>((int) lincsConstraints.size(), 1);
+    CUDAStream<float4>* psLincsDistance = new CUDAStream<float4>((int) lincsConstraints.size(), 1, "LincsDistance");
    gpu->psLincsDistance                = psLincsDistance;
    gpu->sim.pLincsDistance             = psLincsDistance->_pDevData;
-    CUDAStream<int>* psLincsConnections = new CUDAStream<int>(totalLinks, 1);
+    CUDAStream<int>* psLincsConnections = new CUDAStream<int>(totalLinks, 1, "LincsConnections");
    gpu->psLincsConnections             = psLincsConnections;
    gpu->sim.pLincsConnections          = psLincsConnections->_pDevData;
-    CUDAStream<int>* psLincsConnectionsIndex = new CUDAStream<int>((int) lincsConstraints.size()+1, 1);
+    CUDAStream<int>* psLincsConnectionsIndex = new CUDAStream<int>((int) lincsConstraints.size()+1, 1, "LincsConnectionsIndex");
    gpu->psLincsConnectionsIndex             = psLincsConnectionsIndex;
    gpu->sim.pLincsConnectionsIndex          = psLincsConnectionsIndex->_pDevData;
-    CUDAStream<int>* psLincsAtomConstraints = new CUDAStream<int>((int) lincsConstraints.size()*2, 1);
+    CUDAStream<int>* psLincsAtomConstraints = new CUDAStream<int>((int) lincsConstraints.size()*2, 1, "LincsAtomConstraints");
    gpu->psLincsAtomConstraints             = psLincsAtomConstraints;
    gpu->sim.pLincsAtomConstraints          = psLincsAtomConstraints->_pDevData;
-    CUDAStream<int>* psLincsAtomConstraintsIndex = new CUDAStream<int>(gpu->natoms+1, 1);
+    CUDAStream<int>* psLincsAtomConstraintsIndex = new CUDAStream<int>(gpu->natoms+1, 1, "LincsAtomConstraintsIndex");
    gpu->psLincsAtomConstraintsIndex             = psLincsAtomConstraintsIndex;
    gpu->sim.pLincsAtomConstraintsIndex          = psLincsAtomConstraintsIndex->_pDevData;
-    CUDAStream<float>* psLincsS = new CUDAStream<float>((int) lincsConstraints.size(), 1);
+    CUDAStream<float>* psLincsS = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsS");
    gpu->psLincsS             = psLincsS;
    gpu->sim.pLincsS          = psLincsS->_pDevData;
-    CUDAStream<float>* psLincsCoupling = new CUDAStream<float>(totalLinks, 1);
+    CUDAStream<float>* psLincsCoupling = new CUDAStream<float>(totalLinks, 1, "LincsCoupling");
    gpu->psLincsCoupling               = psLincsCoupling;
    gpu->sim.pLincsCoupling            = psLincsCoupling->_pDevData;
-    CUDAStream<float>* psLincsRhs1 = new CUDAStream<float>((int) lincsConstraints.size(), 1);
+    CUDAStream<float>* psLincsRhs1 = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsRhs1");
    gpu->psLincsRhs1             = psLincsRhs1;
    gpu->sim.pLincsRhs1          = psLincsRhs1->_pDevData;
-    CUDAStream<float>* psLincsRhs2 = new CUDAStream<float>((int) lincsConstraints.size(), 1);
+    CUDAStream<float>* psLincsRhs2 = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsRhs2");
    gpu->psLincsRhs2             = psLincsRhs2;
    gpu->sim.pLincsRhs2          = psLincsRhs2->_pDevData;
-    CUDAStream<float>* psLincsSolution = new CUDAStream<float>((int) lincsConstraints.size(), 1);
+    CUDAStream<float>* psLincsSolution = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsSolution");
    gpu->psLincsSolution             = psLincsSolution;
    gpu->sim.pLincsSolution          = psLincsSolution->_pDevData;
-    CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1);
+    CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1, "SyncCounter");
    gpu->psSyncCounter                      = psSyncCounter;
    gpu->sim.pSyncCounter                   = psSyncCounter->_pDevData;
    gpu->sim.lincsConstraints = lincsConstraints.size();
    index = 0;
    for (unsigned int i = 0; i < lincsConstraints.size(); i++) {
        int c = lincsConstraints[i];
-        psLincsAtoms->_pSysData[i].x = atom1[c];
-        psLincsAtoms->_pSysData[i].y = atom2[c];
-        psLincsDistance->_pSysData[i].w = distance[c];
-        psLincsS->_pSysData[i] = 1.0f/sqrt(invMass1[c]+invMass2[c]);
-        psLincsConnectionsIndex->_pSysData[i] = index;
+        (*psLincsAtoms)[i].x = atom1[c];
+        (*psLincsAtoms)[i].y = atom2[c];
+        (*psLincsDistance)[i].w = distance[c];
+        (*psLincsS)[i] = 1.0f/sqrt(invMass1[c]+invMass2[c]);
+        (*psLincsConnectionsIndex)[i] = index;
        for (unsigned int j = 0; j < linkedConstraints[i].size(); j++)
-            psLincsConnections->_pSysData[index++] = linkedConstraints[i][j];
+            (*psLincsConnections)[index++] = linkedConstraints[i][j];
    }
-    psLincsConnectionsIndex->_pSysData[lincsConstraints.size()] = index;
+    (*psLincsConnectionsIndex)[lincsConstraints.size()] = index;
    for (unsigned int i = 0; i < psSyncCounter->_length; i++)
-        psSyncCounter->_pSysData[i] = 0;
+        (*psSyncCounter)[i] = 0;
    index = 0;
    for (unsigned int i = 0; i < atomConstraints.size(); i++) {
-        psLincsAtomConstraintsIndex->_pSysData[i] = index;
+        (*psLincsAtomConstraintsIndex)[i] = index;
        for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
-            psLincsAtomConstraints->_pSysData[index++] = atomConstraints[i][j];
+            (*psLincsAtomConstraints)[index++] = atomConstraints[i][j];
    }
-    psLincsAtomConstraintsIndex->_pSysData[atomConstraints.size()] = index;
+    (*psLincsAtomConstraintsIndex)[atomConstraints.size()] = index;
    psLincsAtoms->Upload();
    psLincsDistance->Upload();
    psLincsS->Upload();
@@ -785,7 +785,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    gpu->sim.NonShakeConstraints                  = count;
    if( count || true ){

-       CUDAStream<int>* psNonShakeID              = new CUDAStream<int>(count, 1);
+       CUDAStream<int>* psNonShakeID              = new CUDAStream<int>(count, 1, "NonShakeID");
       gpu->psNonShakeID                          = psNonShakeID;
       gpu->sim.pNonShakeID                       = psNonShakeID->_pDevStream[0];

@@ -802,7 +802,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
       count = 0;
       for (int i = 0; i < gpu->natoms; i++){
          if (!isShakeAtom[i]){
-             psNonShakeID->_pSysStream[0][count++] = i;
+             (*psNonShakeID)[count++] = i;
          }
       }
       psNonShakeID->Upload();
@@ -821,7 +821,7 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.degreesOfFreedom           = 3 * gpu->sim.atoms - 6;
    gpu->gpAtomTable                    = NULL;
    gpu->gAtomTypes                     = 0;
-    gpu->psPosq4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psPosq4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Posq");
    gpu->sim.stride                     = gpu->psPosq4->_stride;
    gpu->sim.stride2                    = gpu->sim.stride * 2;
    gpu->sim.stride3                    = gpu->sim.stride * 3;
@@ -831,29 +831,29 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.stride2                    = 2 * gpu->sim.stride;
    gpu->sim.stride3                    = 3 * gpu->sim.stride;
    gpu->sim.stride4                    = 4 * gpu->sim.stride;
-    gpu->psPosqP4                       = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psPosqP4                       = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "PosqP");
    gpu->sim.pPosqP                     = gpu->psPosqP4->_pDevStream[0];
-    gpu->psOldPosq4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psOldPosq4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "OldPosq");
    gpu->sim.pOldPosq                   = gpu->psOldPosq4->_pDevStream[0];
-    gpu->psVelm4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psVelm4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Velm");
    gpu->sim.pVelm4                     = gpu->psVelm4->_pDevStream[0];
-    gpu->psvVector4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psvVector4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "vVector");
    gpu->sim.pvVector4                  = gpu->psvVector4->_pDevStream[0];
-    gpu->psxVector4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psxVector4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "xVector");
    gpu->sim.pxVector4                  = gpu->psxVector4->_pDevStream[0];
-    gpu->psBornRadii                    = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psBornRadii                    = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "BornRadii");
    gpu->sim.pBornRadii                 = gpu->psBornRadii->_pDevStream[0];
-    gpu->psObcChain                     = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psObcChain                     = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "ObcChain");
    gpu->sim.pObcChain                  = gpu->psObcChain->_pDevStream[0];
-    gpu->psSigEps2                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psSigEps2                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
    gpu->sim.pAttr                      = gpu->psSigEps2->_pDevStream[0];
-    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
    gpu->sim.pObcData                   = gpu->psObcData->_pDevStream[0];
    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
-    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1, "AtomIndex");
    gpu->sim.pAtomIndex                 = gpu->psAtomIndex->_pDevStream[0];
    for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
-        gpu->psAtomIndex->_pSysStream[0][i] = i;
+        (*gpu->psAtomIndex)[i] = i;
    gpu->psAtomIndex->Upload();
    // Determine randoms
    gpu->seed                           = 1;
@@ -862,10 +862,10 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.randoms                    = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
    gpu->sim.totalRandoms               = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
    gpu->sim.totalRandomsTimesTwo       = gpu->sim.totalRandoms * 2;
-    gpu->psRandom4                      = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1);
-    gpu->psRandom2                      = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1);
-    gpu->psRandomPosition               = new CUDAStream<int>(gpu->sim.blocks, 1);
-    gpu->psRandomSeed                   = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1);
+    gpu->psRandom4                      = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1, "Random4");
+    gpu->psRandom2                      = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1, "Random2");
+    gpu->psRandomPosition               = new CUDAStream<int>(gpu->sim.blocks, 1, "RandomPosition");
+    gpu->psRandomSeed                   = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1, "RandomSeed");
    gpu->sim.pRandom4a                  = gpu->psRandom4->_pDevStream[0];
    gpu->sim.pRandom2a                  = gpu->psRandom2->_pDevStream[0];
    gpu->sim.pRandom4b                  = gpu->psRandom4->_pDevStream[0] + gpu->sim.totalRandoms;
@@ -874,14 +874,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.pRandomSeed                = gpu->psRandomSeed->_pDevStream[0];

    // Allocate and clear linear momentum buffer
-    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1);
+    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1, "LinearMomentum");
    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
    for (int i = 0; i < (int) gpu->sim.blocks; i++)
    {
-        gpu->psLinearMomentum->_pSysStream[0][i].x = 0.0f;
-        gpu->psLinearMomentum->_pSysStream[0][i].y = 0.0f;
-        gpu->psLinearMomentum->_pSysStream[0][i].z = 0.0f;
-        gpu->psLinearMomentum->_pSysStream[0][i].w = 0.0f;
+        (*gpu->psLinearMomentum)[i].x = 0.0f;
+        (*gpu->psLinearMomentum)[i].y = 0.0f;
+        (*gpu->psLinearMomentum)[i].z = 0.0f;
+        (*gpu->psLinearMomentum)[i].w = 0.0f;
    }
    gpu->psLinearMomentum->Upload();

@@ -893,9 +893,9 @@ void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>
 {
    for (int i = 0; i < gpu->natoms; i++)
    {
-        gpu->psPosq4->_pSysStream[0][i].x = x[i];
-        gpu->psPosq4->_pSysStream[0][i].y = y[i];
-        gpu->psPosq4->_pSysStream[0][i].z = z[i];
+        (*gpu->psPosq4)[i].x = x[i];
+        (*gpu->psPosq4)[i].y = y[i];
+        (*gpu->psPosq4)[i].z = z[i];
    }
    gpu->psPosq4->Upload();

@@ -909,9 +909,9 @@ void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float
 {
    for (int i = 0; i < gpu->natoms; i++)
    {
-        gpu->psVelm4->_pSysStream[0][i].x = x[i];
-        gpu->psVelm4->_pSysStream[0][i].y = y[i];
-        gpu->psVelm4->_pSysStream[0][i].z = z[i];
+        (*gpu->psVelm4)[i].x = x[i];
+        (*gpu->psVelm4)[i].y = y[i];
+        (*gpu->psVelm4)[i].z = z[i];
    }
    gpu->psVelm4->Upload();
 } 
@@ -922,7 +922,7 @@ void gpuSetMass(gpuContext gpu, const vector<float>& mass)
    float totalMass = 0.0f;
    for (int i = 0; i < gpu->natoms; i++)
    {
-        gpu->psVelm4->_pSysStream[0][i].w = 1.0f/mass[i];
+        (*gpu->psVelm4)[i].w = 1.0f/mass[i];
        totalMass += mass[i];
    }
    gpu->sim.inverseTotalMass = 1.0f / totalMass;
@@ -934,16 +934,16 @@ void gpuInitializeRandoms(gpuContext gpu)
 {
    for (int i = 0; i < (int) gpu->sim.blocks; i++)
    {
-        gpu->psRandomPosition->_pSysStream[0][i] = 0;
+        (*gpu->psRandomPosition)[i] = 0;
    }
    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    srand(seed);
    for (int i = 0; i < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); i++)
    {
-        gpu->psRandomSeed->_pSysStream[0][i].x = rand();
-        gpu->psRandomSeed->_pSysStream[0][i].y = rand();
-        gpu->psRandomSeed->_pSysStream[0][i].z = rand();
-        gpu->psRandomSeed->_pSysStream[0][i].w = rand();
+        (*gpu->psRandomSeed)[i].x = rand();
+        (*gpu->psRandomSeed)[i].y = rand();
+        (*gpu->psRandomSeed)[i].z = rand();
+        (*gpu->psRandomSeed)[i].w = rand();
    }
    gpu->psRandomPosition->Upload();
    gpu->psRandomSeed->Upload();
@@ -1046,10 +1046,10 @@ void* gpuInit(int numAtoms)
    gpuAllocateInitialBuffers(gpu);
    for (int i = 0; i < gpu->natoms; i++)
    {
-        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
-        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
-        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
-        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;
+        (*gpu->psxVector4)[i].x = 0.0f;
+        (*gpu->psxVector4)[i].y = 0.0f;
+        (*gpu->psxVector4)[i].z = 0.0f;
+        (*gpu->psxVector4)[i].w = 0.0f;
    }
    gpu->psxVector4->Upload();

@@ -1323,9 +1323,9 @@ int gpuBuildOutputBuffers(gpuContext gpu)
        }
    }    
    gpu->sim.outputBuffers      = outputBuffers;
-    gpu->psForce4               = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers);
-    gpu->psBornForce            = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
-    gpu->psBornSum              = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
+    gpu->psForce4               = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers, "Force");
+    gpu->psBornForce            = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
+    gpu->psBornSum              = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
    gpu->sim.pForce4            = gpu->psForce4->_pDevStream[0];
    gpu->sim.pForce4a           = gpu->sim.pForce4;
    gpu->sim.pForce4b           = gpu->sim.pForce4 + 1 * gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
@@ -1348,33 +1348,33 @@ int gpuBuildOutputBuffers(gpuContext gpu)
    int flip = outputBuffers - 1;
    for (int i = 0; i < (int) gpu->sim.bonds; i++)
    {
-        gpu->psBondID->_pSysStream[0][i].z = flip - gpu->psBondID->_pSysStream[0][i].z;
-        gpu->psBondID->_pSysStream[0][i].w = flip - gpu->psBondID->_pSysStream[0][i].w;
+        (*gpu->psBondID)[i].z = flip - (*gpu->psBondID)[i].z;
+        (*gpu->psBondID)[i].w = flip - (*gpu->psBondID)[i].w;
    }
    for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
    {
-        gpu->psBondAngleID1->_pSysStream[0][i].w = flip - gpu->psBondAngleID1->_pSysStream[0][i].w;
-        gpu->psBondAngleID2->_pSysStream[0][i].x = flip - gpu->psBondAngleID2->_pSysStream[0][i].x;
-        gpu->psBondAngleID2->_pSysStream[0][i].y = flip - gpu->psBondAngleID2->_pSysStream[0][i].y;
+        (*gpu->psBondAngleID1)[i].w = flip - (*gpu->psBondAngleID1)[i].w;
+        (*gpu->psBondAngleID2)[i].x = flip - (*gpu->psBondAngleID2)[i].x;
+        (*gpu->psBondAngleID2)[i].y = flip - (*gpu->psBondAngleID2)[i].y;
    }
    for (int i = 0; i < (int) gpu->sim.dihedrals; i++)
    {
-        gpu->psDihedralID2->_pSysStream[0][i].x = flip - gpu->psDihedralID2->_pSysStream[0][i].x;
-        gpu->psDihedralID2->_pSysStream[0][i].y = flip - gpu->psDihedralID2->_pSysStream[0][i].y;
-        gpu->psDihedralID2->_pSysStream[0][i].z = flip - gpu->psDihedralID2->_pSysStream[0][i].z;
-        gpu->psDihedralID2->_pSysStream[0][i].w = flip - gpu->psDihedralID2->_pSysStream[0][i].w;
+        (*gpu->psDihedralID2)[i].x = flip - (*gpu->psDihedralID2)[i].x;
+        (*gpu->psDihedralID2)[i].y = flip - (*gpu->psDihedralID2)[i].y;
+        (*gpu->psDihedralID2)[i].z = flip - (*gpu->psDihedralID2)[i].z;
+        (*gpu->psDihedralID2)[i].w = flip - (*gpu->psDihedralID2)[i].w;
    }
    for (int i = 0; i < (int) gpu->sim.rb_dihedrals; i++)
    {
-        gpu->psRbDihedralID2->_pSysStream[0][i].x = flip - gpu->psRbDihedralID2->_pSysStream[0][i].x;
-        gpu->psRbDihedralID2->_pSysStream[0][i].y = flip - gpu->psRbDihedralID2->_pSysStream[0][i].y;
-        gpu->psRbDihedralID2->_pSysStream[0][i].z = flip - gpu->psRbDihedralID2->_pSysStream[0][i].z;
-        gpu->psRbDihedralID2->_pSysStream[0][i].w = flip - gpu->psRbDihedralID2->_pSysStream[0][i].w;
+        (*gpu->psRbDihedralID2)[i].x = flip - (*gpu->psRbDihedralID2)[i].x;
+        (*gpu->psRbDihedralID2)[i].y = flip - (*gpu->psRbDihedralID2)[i].y;
+        (*gpu->psRbDihedralID2)[i].z = flip - (*gpu->psRbDihedralID2)[i].z;
+        (*gpu->psRbDihedralID2)[i].w = flip - (*gpu->psRbDihedralID2)[i].w;
    }
    for (int i = 0; i < (int) gpu->sim.LJ14s; i++)
    {
-        gpu->psLJ14ID->_pSysStream[0][i].z = flip - gpu->psLJ14ID->_pSysStream[0][i].z;
-        gpu->psLJ14ID->_pSysStream[0][i].w = flip - gpu->psLJ14ID->_pSysStream[0][i].w;
+        (*gpu->psLJ14ID)[i].z = flip - (*gpu->psLJ14ID)[i].z;
+        (*gpu->psLJ14ID)[i].w = flip - (*gpu->psLJ14ID)[i].w;
    }
    gpu->psBondID->Upload();
    gpu->psBondAngleID1->Upload();
@@ -1393,23 +1393,23 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
    const unsigned int grid = gpu->grid;
    const unsigned int dim = (atoms + (grid - 1)) / grid;
    const unsigned int cells = dim * (dim + 1) / 2;
-    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
-    unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
+    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
+    unsigned int* pWorkList = psWorkUnit->_pSysData;
    gpu->psWorkUnit = psWorkUnit;
    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
-    CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
+    CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "InteractingWorkUnit");
    gpu->psInteractingWorkUnit = psInteractingWorkUnit;
    gpu->sim.pInteractingWorkUnit = psInteractingWorkUnit->_pDevStream[0];
-    CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u);
+    CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u, "InteractionFlag");
    gpu->psInteractionFlag = psInteractionFlag;
    gpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
-    CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u);
+    CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u, "InteractionCount");
    gpu->psInteractionCount = psInteractionCount;
    gpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
-    CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u);
+    CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u, "GridBoundingBox");
    gpu->psGridBoundingBox = psGridBoundingBox;
    gpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
-    CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u);
+    CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u, "GridCenter");
    gpu->psGridCenter = psGridCenter;
    gpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
    gpu->sim.nonbond_workBlock      = gpu->sim.nonbond_threads_per_block / GRID;
@@ -1485,7 +1485,7 @@ void gpuBuildExclusionList(gpuContext gpu)
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;
    const unsigned int dim = atoms/grid;
-    unsigned int* pWorkList = gpu->psWorkUnit->_pSysStream[0];
+    unsigned int* pWorkList = gpu->psWorkUnit->_pSysData;

    // Mark which work units have exclusions.

@@ -1514,7 +1514,7 @@ void gpuBuildExclusionList(gpuContext gpu)

    // Build a list of indexes for the work units with exclusions.

-    CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u);
+    CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u, "ExclusionIndex");
    gpu->psExclusionIndex = psExclusionIndex;
    unsigned int* pExclusionIndex = psExclusionIndex->_pSysData;
    gpu->sim.pExclusionIndex = psExclusionIndex->_pDevData;
@@ -1525,7 +1525,7 @@ void gpuBuildExclusionList(gpuContext gpu)

    // Record the exclusion data.

-    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u);
+    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u, "Exclusion");
    gpu->psExclusion = psExclusion;
    unsigned int* pExclusion = psExclusion->_pSysData;
    gpu->sim.pExclusion = psExclusion->_pDevData;
@@ -1619,11 +1619,11 @@ static void findMoleculeGroups(gpuContext gpu)
    vector<Constraint> constraints;
    for (int i = 0; i < gpu->sim.ShakeConstraints; i++)
    {
-        int atom1 = gpu->psShakeID->_pSysData[i].x;
-        int atom2 = gpu->psShakeID->_pSysData[i].y;
-        int atom3 = gpu->psShakeID->_pSysData[i].z;
-        int atom4 = gpu->psShakeID->_pSysData[i].w;
-        float distance2 = gpu->psShakeParameter->_pSysData[i].z;
+        int atom1 = (*gpu->psShakeID)[i].x;
+        int atom2 = (*gpu->psShakeID)[i].y;
+        int atom3 = (*gpu->psShakeID)[i].z;
+        int atom4 = (*gpu->psShakeID)[i].w;
+        float distance2 = (*gpu->psShakeParameter)[i].z;
        constraints.push_back(Constraint(atom1, atom2, distance2));
        if (atom3 != -1)
            constraints.push_back(Constraint(atom1, atom3, distance2));
@@ -1632,11 +1632,11 @@ static void findMoleculeGroups(gpuContext gpu)
    }
    for (int i = 0; i < gpu->sim.settleConstraints; i++)
    {
-        int atom1 = gpu->psSettleID->_pSysData[i].x;
-        int atom2 = gpu->psSettleID->_pSysData[i].y;
-        int atom3 = gpu->psSettleID->_pSysData[i].z;
-        float distance12 = gpu->psSettleParameter->_pSysData[i].x;
-        float distance23 = gpu->psSettleParameter->_pSysData[i].y;
+        int atom1 = (*gpu->psSettleID)[i].x;
+        int atom2 = (*gpu->psSettleID)[i].y;
+        int atom3 = (*gpu->psSettleID)[i].z;
+        float distance12 = (*gpu->psSettleParameter)[i].x;
+        float distance23 = (*gpu->psSettleParameter)[i].y;
        constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
        constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
        constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
@@ -1648,8 +1648,8 @@ static void findMoleculeGroups(gpuContext gpu)
    vector<vector<int> > atomBonds(numAtoms);
    for (int i = 0; i < gpu->sim.bonds; i++)
    {
-        int atom1 = gpu->psBondID->_pSysData[i].x;
-        int atom2 = gpu->psBondID->_pSysData[i].y;
+        int atom1 = (*gpu->psBondID)[i].x;
+        int atom2 = (*gpu->psBondID)[i].y;
        atomBonds[atom1].push_back(atom2);
        atomBonds[atom2].push_back(atom1);
    }
@@ -1679,22 +1679,22 @@ static void findMoleculeGroups(gpuContext gpu)
        molecules[i].atoms = atomIndices[i];
    for (int i = 0; i < gpu->sim.bonds; i++)
    {
-        int atom1 = gpu->psBondID->_pSysData[i].x;
+        int atom1 = (*gpu->psBondID)[i].x;
        molecules[atomMolecule[atom1]].bonds.push_back(i);
    }
    for (int i = 0; i < gpu->sim.bond_angles; i++)
    {
-        int atom1 = gpu->psBondAngleID1->_pSysData[i].x;
+        int atom1 = (*gpu->psBondAngleID1)[i].x;
        molecules[atomMolecule[atom1]].angles.push_back(i);
    }
    for (int i = 0; i < gpu->sim.dihedrals; i++)
    {
-        int atom1 = gpu->psDihedralID1->_pSysData[i].x;
+        int atom1 = (*gpu->psDihedralID1)[i].x;
        molecules[atomMolecule[atom1]].periodicTorsions.push_back(i);
    }
    for (int i = 0; i < gpu->sim.rb_dihedrals; i++)
    {
-        int atom1 = gpu->psRbDihedralID1->_pSysData[i].x;
+        int atom1 = (*gpu->psRbDihedralID1)[i].x;
        molecules[atomMolecule[atom1]].rbTorsions.push_back(i);
    }
    for (int i = 0; i < constraints.size(); i++)
@@ -1937,7 +1937,7 @@ void gpuReorderAtoms(gpuContext gpu)
            {
                int oldIndex = mol.instances[molBins[i].second]+atoms[j];
                int newIndex = mol.instances[i]+atoms[j];
-                originalIndex[newIndex] = gpu->psAtomIndex->_pSysStream[0][oldIndex];
+                originalIndex[newIndex] = (*gpu->psAtomIndex)[oldIndex];
                newPosq[newIndex] = posq[oldIndex];
                newVelm[newIndex] = velm[oldIndex];
            }
@@ -1953,6 +1953,6 @@ void gpuReorderAtoms(gpuContext gpu)
        velm[i] = newVelm[i];
    gpu->psVelm4->Upload();
    for (int i = 0; i < numAtoms; i++)
-        gpu->psAtomIndex->_pSysData[i] = originalIndex[i];
+        (*gpu->psAtomIndex)[i] = originalIndex[i];
    gpu->psAtomIndex->Upload();
 }
--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -428,8 +428,8 @@ void testBlockInteractions(bool periodic) {
    data.gpu->psGridBoundingBox->Download();
    data.gpu->psGridCenter->Download();
    for (int i = 0; i < numBlocks; i++) {
-        float4 gridSize = data.gpu->psGridBoundingBox->_pSysData[i];
-        float4 center = data.gpu->psGridCenter->_pSysData[i];
+        float4 gridSize = (*data.gpu->psGridBoundingBox)[i];
+        float4 center = (*data.gpu->psGridCenter)[i];
        if (periodic) {
            ASSERT(gridSize.x < 0.5*boxSize);
            ASSERT(gridSize.y < 0.5*boxSize);
@@ -437,7 +437,7 @@ void testBlockInteractions(bool periodic) {
        }
        float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
        for (int j = 0; j < blockSize; j++) {
-            float4 pos = data.gpu->psPosq4->_pSysData[i*blockSize+j];
+            float4 pos = (*data.gpu->psPosq4)[i*blockSize+j];
            float dx = pos.x-center.x;
            float dy = pos.y-center.y;
            float dz = pos.z-center.z;
@@ -467,7 +467,7 @@ void testBlockInteractions(bool periodic) {
    // Verify that interactions were identified correctly.

    data.gpu->psInteractionCount->Download();
-    int numWithInteractions = data.gpu->psInteractionCount->_pSysData[0];
+    int numWithInteractions = (*data.gpu->psInteractionCount)[0];
    vector<bool> hasInteractions(data.gpu->sim.workUnits, false);
    data.gpu->psInteractingWorkUnit->Download();
    data.gpu->psInteractionFlag->Download();
@@ -475,7 +475,7 @@ void testBlockInteractions(bool periodic) {
    const unsigned int grid = data.gpu->grid;
    const unsigned int dim = (atoms+(grid-1))/grid;
    for (int i = 0; i < numWithInteractions; i++) {
-        unsigned int workUnit = data.gpu->psInteractingWorkUnit->_pSysData[i];
+        unsigned int workUnit = (*data.gpu->psInteractingWorkUnit)[i];
        unsigned int x = (workUnit >> 17);
        unsigned int y = ((workUnit >> 2) & 0x7fff);
        int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
@@ -483,10 +483,10 @@ void testBlockInteractions(bool periodic) {

        // Make sure this tile really should have been flagged based on bounding volumes.

-        float4 gridSize1 = data.gpu->psGridBoundingBox->_pSysData[x];
-        float4 gridSize2 = data.gpu->psGridBoundingBox->_pSysData[y];
-        float4 center1 = data.gpu->psGridCenter->_pSysData[x];
-        float4 center2 = data.gpu->psGridCenter->_pSysData[y];
+        float4 gridSize1 = (*data.gpu->psGridBoundingBox)[x];
+        float4 gridSize2 = (*data.gpu->psGridBoundingBox)[y];
+        float4 center1 = (*data.gpu->psGridCenter)[x];
+        float4 center2 = (*data.gpu->psGridCenter)[y];
        float dx = center1.x-center2.x;
        float dy = center1.y-center2.y;
        float dz = center1.z-center2.z;
@@ -502,12 +502,12 @@ void testBlockInteractions(bool periodic) {

        // Check the interaction flags.

-        unsigned int flags = data.gpu->psInteractionFlag->_pSysData[i];
+        unsigned int flags = (*data.gpu->psInteractionFlag)[i];
        for (int atom2 = 0; atom2 < 32; atom2++) {
            if ((flags & 1) == 0) {
-                float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2];
+                float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
                for (int atom1 = 0; atom1 < blockSize; ++atom1) {
-                    float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1];
+                    float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
                    float dx = pos2.x-pos1.x;
                    float dy = pos2.y-pos1.y;
                    float dz = pos2.z-pos1.z;
@@ -536,13 +536,13 @@ void testBlockInteractions(bool periodic) {
    data.gpu->psWorkUnit->Download();
    for (int i = 0; i < hasInteractions.size(); i++)
        if (!hasInteractions[i]) {
-            unsigned int workUnit = data.gpu->psWorkUnit->_pSysData[i];
+            unsigned int workUnit = (*data.gpu->psWorkUnit)[i];
            unsigned int x = (workUnit >> 17);
            unsigned int y = ((workUnit >> 2) & 0x7fff);
            for (int atom1 = 0; atom1 < blockSize; ++atom1) {
-                float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1];
+                float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
                for (int atom2 = 0; atom2 < blockSize; ++atom2) {
-                    float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2];
+                    float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
                    float dx = pos1.x-pos2.x;
                    float dy = pos1.y-pos2.y;
                    float dz = pos1.z-pos2.z;