Began converting CudaArrays.

b8c86406 · Peter Eastman · b33ee3b0 · b8c86406 · b8c86406 · b8c86406
Commit b8c86406 authored Feb 12, 2018 by Peter Eastman
12 changed files
--- a/platforms/cuda/include/CudaArray.h
+++ b/platforms/cuda/include/CudaArray.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -57,6 +57,11 @@ public:
    static CudaArray* create(CudaContext& context, int size, const std::string& name) {
        return new CudaArray(context, size, sizeof(T), name);
    }
+    /**
+     * Create an uninitialized CudaArray object.  It does not point to any device memory,
+     * and cannot be used until initialize() is called on it.
+     */
+    CudaArray();
    /**
     * Create a CudaArray object.
     *
@@ -67,6 +72,36 @@ public:
     */
    CudaArray(CudaContext& context, int size, int elementSize, const std::string& name);
    ~CudaArray();
+    /**
+     * Initialize this object.
+     *
+     * @param context           the context for which to create the array
+     * @param size              the number of elements in the array
+     * @param elementSize       the size of each element in bytes
+     * @param name              the name of the array
+     */
+    void initialize(CudaContext& context, int size, int elementSize, const std::string& name);
+    /**
+     * Allocate storage for this array, taking the element size from the template argument T.
+     *
+     * @param context           the context the array belongs to
+     * @param size              how many elements of type T the array holds
+     * @param name              a name for the array, forwarded to the non-template overload
+     */
+    template <class T>
+    void initialize(CudaContext& context, int size, const std::string& name) {
+        this->initialize(context, size, (int) sizeof(T), name);
+    }
+    /**
+     * Recreate the internal storage to have a different size.  The previous contents are not preserved.
+     */
+    void resize(int size);
+    /**
+     * Report whether this array currently holds a nonzero device pointer.
+     */
+    bool isInitialized() const {
+        return pointer != 0;
+    }
    /**
     * Get the number of elements in the array.
     */
@@ -134,7 +169,7 @@ public:
     */
    void copyTo(CudaArray& dest) const;
 private:
-    CudaContext& context;
+    CudaContext* context;
    CUdeviceptr pointer;
    int size, elementSize;
    bool ownsMemory;

--- a/platforms/cuda/include/CudaBondedUtilities.h
+++ b/platforms/cuda/include/CudaBondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -81,7 +81,6 @@ namespace OpenMM {
 class OPENMM_EXPORT_CUDA CudaBondedUtilities {
 public:
    CudaBondedUtilities(CudaContext& context);
-    ~CudaBondedUtilities();
    /**
     * Add a bonded interaction.
     *
@@ -136,7 +135,7 @@ private:
    std::vector<int> forceGroup;
    std::vector<CUdeviceptr> arguments;
    std::vector<std::string> argTypes;
-    std::vector<std::vector<CudaArray*> > atomIndices;
+    std::vector<std::vector<CudaArray> > atomIndices;
    std::vector<std::string> prefixCode;
    std::vector<std::string> energyParameterDerivatives;
    std::vector<void*> kernelArgs;

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -41,6 +41,7 @@
 #include <builtin_types.h>
 #include <vector_functions.h>
 #include "windowsExportCuda.h"
+#include "CudaArray.h"
 #include "CudaPlatform.h"
 #include "openmm/Kernel.h"

@@ -48,7 +49,6 @@ typedef unsigned int tileflags;

 namespace OpenMM {

-class CudaArray;
 class CudaForceInfo;
 class CudaExpressionUtilities;
 class CudaIntegrationUtilities;
@@ -152,37 +152,37 @@ public:
     * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
     */
    CudaArray& getPosq() {
-        return *posq;
+        return posq;
    }
    /**
     * Get the array which contains a correction to the position of each atom.  This only exists if getUseMixedPrecision() returns true.
     */
    CudaArray& getPosqCorrection() {
-        return *posqCorrection;
+        return posqCorrection;
    }
    /**
     * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
     */
    CudaArray& getVelm() {
-        return *velm;
+        return velm;
    }
    /**
     * Get the array which contains the force on each atom (represented as three long longs in 64 bit fixed point).
     */
    CudaArray& getForce() {
-        return *force;
+        return force;
    }
    /**
     * Get the array which contains the buffer in which energy is computed.
     */
    CudaArray& getEnergyBuffer() {
-        return *energyBuffer;
+        return energyBuffer;
    }
    /**
     * Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed.
     */
    CudaArray& getEnergyParamDerivBuffer() {
-        return *energyParamDerivBuffer;
+        return energyParamDerivBuffer;
    }
    /**
     * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
@@ -201,7 +201,7 @@ public:
     * Get the array which contains the index of each atom.
     */
    CudaArray& getAtomIndexArray() {
-        return *atomIndexDevice;
+        return atomIndexDevice;
    }
    /**
     * Get the number of cells by which the positions are offset.
@@ -649,15 +649,15 @@ private:
    std::vector<MoleculeGroup> moleculeGroups;
    std::vector<int4> posCellOffsets;
    void* pinnedBuffer;
-    CudaArray* posq;
-    CudaArray* posqCorrection;
-    CudaArray* velm;
-    CudaArray* force;
-    CudaArray* energyBuffer;
-    CudaArray* energySum;
-    CudaArray* energyParamDerivBuffer;
-    CudaArray* atomIndexDevice;
-    CudaArray* chargeBuffer;
+    CudaArray posq;
+    CudaArray posqCorrection;
+    CudaArray velm;
+    CudaArray force;
+    CudaArray energyBuffer;
+    CudaArray energySum;
+    CudaArray energyParamDerivBuffer;
+    CudaArray atomIndexDevice;
+    CudaArray chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;

--- a/platforms/cuda/include/CudaIntegrationUtilities.h
+++ b/platforms/cuda/include/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -47,20 +47,20 @@ public:
     * Get the array which contains position deltas.
     */
    CudaArray& getPosDelta() {
-        return *posDelta;
+        return posDelta;
    }
    /**
     * Get the array which contains random values.  Each element is a float4, whose components
     * are independent, normally distributed random numbers with mean 0 and variance 1.
     */
    CudaArray& getRandom() {
-        return *random;
+        return random;
    }
    /**
     * Get the array which contains the current step size.
     */
    CudaArray& getStepSize() {
-        return *stepSize;
+        return stepSize;
    }
    /**
     * Set the size to use for the next step.
@@ -131,38 +131,38 @@ private:
    CUfunction ccmaUpdateKernel;
    CUfunction vsitePositionKernel, vsiteForceKernel;
    CUfunction randomKernel, timeShiftKernel;
-    CudaArray* posDelta;
-    CudaArray* settleAtoms;
-    CudaArray* settleParams;
-    CudaArray* shakeAtoms;
-    CudaArray* shakeParams;
-    CudaArray* random;
-    CudaArray* randomSeed;
-    CudaArray* stepSize;
-    CudaArray* ccmaAtoms;
-    CudaArray* ccmaDistance;
-    CudaArray* ccmaReducedMass;
-    CudaArray* ccmaAtomConstraints;
-    CudaArray* ccmaNumAtomConstraints;
-    CudaArray* ccmaConstraintMatrixColumn;
-    CudaArray* ccmaConstraintMatrixValue;
-    CudaArray* ccmaDelta1;
-    CudaArray* ccmaDelta2;
-    CudaArray* ccmaConverged;
+    CudaArray posDelta;
+    CudaArray settleAtoms;
+    CudaArray settleParams;
+    CudaArray shakeAtoms;
+    CudaArray shakeParams;
+    CudaArray random;
+    CudaArray randomSeed;
+    CudaArray stepSize;
+    CudaArray ccmaAtoms;
+    CudaArray ccmaDistance;
+    CudaArray ccmaReducedMass;
+    CudaArray ccmaAtomConstraints;
+    CudaArray ccmaNumAtomConstraints;
+    CudaArray ccmaConstraintMatrixColumn;
+    CudaArray ccmaConstraintMatrixValue;
+    CudaArray ccmaDelta1;
+    CudaArray ccmaDelta2;
+    CudaArray ccmaConverged;
    int* ccmaConvergedMemory;
    CUdeviceptr ccmaConvergedDeviceMemory;
    CUevent ccmaEvent;
-    CudaArray* vsite2AvgAtoms;
-    CudaArray* vsite2AvgWeights;
-    CudaArray* vsite3AvgAtoms;
-    CudaArray* vsite3AvgWeights;
-    CudaArray* vsiteOutOfPlaneAtoms;
-    CudaArray* vsiteOutOfPlaneWeights;
-    CudaArray* vsiteLocalCoordsIndex;
-    CudaArray* vsiteLocalCoordsAtoms;
-    CudaArray* vsiteLocalCoordsWeights;
-    CudaArray* vsiteLocalCoordsPos;
-    CudaArray* vsiteLocalCoordsStartIndex;
+    CudaArray vsite2AvgAtoms;
+    CudaArray vsite2AvgWeights;
+    CudaArray vsite3AvgAtoms;
+    CudaArray vsite3AvgWeights;
+    CudaArray vsiteOutOfPlaneAtoms;
+    CudaArray vsiteOutOfPlaneWeights;
+    CudaArray vsiteLocalCoordsIndex;
+    CudaArray vsiteLocalCoordsAtoms;
+    CudaArray vsiteLocalCoordsWeights;
+    CudaArray vsiteLocalCoordsPos;
+    CudaArray vsiteLocalCoordsStartIndex;
    int randomPos;
    int lastSeed, numVsites;
    double2 lastStepSize;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -164,61 +164,61 @@ public:
     * Get the array containing the center of each atom block.
     */
    CudaArray& getBlockCenters() {
-        return *blockCenter;
+        return blockCenter;
    }
    /**
     * Get the array containing the dimensions of each atom block.
     */
    CudaArray& getBlockBoundingBoxes() {
-        return *blockBoundingBox;
+        return blockBoundingBox;
    }
    /**
     * Get the array whose first element contains the number of tiles with interactions.
     */
    CudaArray& getInteractionCount() {
-        return *interactionCount;
+        return interactionCount;
    }
    /**
     * Get the array containing tiles with interactions.
     */
    CudaArray& getInteractingTiles() {
-        return *interactingTiles;
+        return interactingTiles;
    }
    /**
     * Get the array containing the atoms in each tile with interactions.
     */
    CudaArray& getInteractingAtoms() {
-        return *interactingAtoms;
+        return interactingAtoms;
    }
    /**
     * Get the array containing single pairs in the neighbor list.
     */
    CudaArray& getSinglePairs() {
-        return *singlePairs;
+        return singlePairs;
    }
    /**
     * Get the array containing exclusion flags.
     */
    CudaArray& getExclusions() {
-        return *exclusions;
+        return exclusions;
    }
    /**
     * Get the array containing tiles with exclusions.
     */
    CudaArray& getExclusionTiles() {
-        return *exclusionTiles;
+        return exclusionTiles;
    }
    /**
     * Get the array containing the index into the exclusion array for each tile.
     */
    CudaArray& getExclusionIndices() {
-        return *exclusionIndices;
+        return exclusionIndices;
    }
    /**
     * Get the array listing where the exclusion data starts for each row.
     */
    CudaArray& getExclusionRowIndices() {
-        return *exclusionRowIndices;
+        return exclusionRowIndices;
    }
    /**
     * Get the index of the first tile this context is responsible for processing.
@@ -270,22 +270,22 @@ private:
    class BlockSortTrait;
    CudaContext& context;
    std::map<int, KernelSet> groupKernels;
-    CudaArray* exclusionTiles;
-    CudaArray* exclusions;
-    CudaArray* exclusionIndices;
-    CudaArray* exclusionRowIndices;
-    CudaArray* interactingTiles;
-    CudaArray* interactingAtoms;
-    CudaArray* interactionCount;
-    CudaArray* singlePairs;
-    CudaArray* singlePairCount;
-    CudaArray* blockCenter;
-    CudaArray* blockBoundingBox;
-    CudaArray* sortedBlocks;
-    CudaArray* sortedBlockCenter;
-    CudaArray* sortedBlockBoundingBox;
-    CudaArray* oldPositions;
-    CudaArray* rebuildNeighborList;
+    CudaArray exclusionTiles;
+    CudaArray exclusions;
+    CudaArray exclusionIndices;
+    CudaArray exclusionRowIndices;
+    CudaArray interactingTiles;
+    CudaArray interactingAtoms;
+    CudaArray interactionCount;
+    CudaArray singlePairs;
+    CudaArray singlePairCount;
+    CudaArray blockCenter;
+    CudaArray blockBoundingBox;
+    CudaArray sortedBlocks;
+    CudaArray sortedBlockCenter;
+    CudaArray sortedBlockBoundingBox;
+    CudaArray oldPositions;
+    CudaArray rebuildNeighborList;
    CudaSort* blockSorter;
    CUevent downloadCountEvent;
    int* pinnedCountBuffer;

--- a/platforms/cuda/include/CudaSort.h
+++ b/platforms/cuda/include/CudaSort.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -87,11 +87,11 @@ public:
 private:
    CudaContext& context;
    SortTrait* trait;
-    CudaArray* dataRange;
-    CudaArray* bucketOfElement;
-    CudaArray* offsetInBucket;
-    CudaArray* bucketOffset;
-    CudaArray* buckets;
+    CudaArray dataRange;
+    CudaArray bucketOfElement;
+    CudaArray offsetInBucket;
+    CudaArray bucketOffset;
+    CudaArray buckets;
    CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
    bool isShortList;

--- a/platforms/cuda/src/CudaArray.cpp
+++ b/platforms/cuda/src/CudaArray.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Portions copyright (c) 2012-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,18 +32,15 @@

 using namespace OpenMM;

-CudaArray::CudaArray(CudaContext& context, int size, int elementSize, const std::string& name) :
-        context(context), size(size), elementSize(elementSize), name(name), ownsMemory(true) {
-    CUresult result = cuMemAlloc(&pointer, size*elementSize);
-    if (result != CUDA_SUCCESS) {
-        std::stringstream str;
-        str<<"Error creating array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(str.str());
-    }
+CudaArray::CudaArray() : context(NULL), pointer(0), size(0), elementSize(0), ownsMemory(false) {
+} // every scalar member starts zeroed, so an uninitialized array can be copied or queried without reading indeterminate values
+
+CudaArray::CudaArray(CudaContext& context, int size, int elementSize, const std::string& name) : pointer(0) {
+    this->initialize(context, size, elementSize, name); // pointer is zeroed first because initialize() rejects a nonzero pointer
 }

 CudaArray::~CudaArray() {
-    if (ownsMemory && context.getContextIsValid()) {
+    if (pointer != 0 && ownsMemory && context->getContextIsValid()) {
        CUresult result = cuMemFree(pointer);
        if (result != CUDA_SUCCESS) {
            std::stringstream str;
@@ -53,12 +50,45 @@ CudaArray::~CudaArray() {
    }
 }

+void CudaArray::initialize(CudaContext& context, int size, int elementSize, const std::string& name) {
+    if (this->pointer != 0)
+        throw OpenMMException("CudaArray has already been initialized");
+    this->context = &context;
+    this->size = size;
+    this->elementSize = elementSize;
+    this->name = name;
+    ownsMemory = true;
+    CUresult result = cuMemAlloc(&pointer, ((size_t) size)*elementSize); // widened first: size*elementSize can overflow int
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error creating array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
+void CudaArray::resize(int size) {
+    if (pointer == 0)
+        throw OpenMMException("CudaArray has not been initialized");
+    if (ownsMemory == false)
+        throw OpenMMException("Cannot resize an array that does not own its storage");
+    const CUresult freeStatus = cuMemFree(pointer); // release the old block; nothing is copied out of it
+    if (freeStatus != CUDA_SUCCESS) {
+        std::stringstream message;
+        message<<"Error deleting array "<<name<<": "<<CudaContext::getErrorString(freeStatus)<<" ("<<freeStatus<<")";
+        throw OpenMMException(message.str());
+    }
+    pointer = 0; // initialize() refuses to run while pointer is nonzero
+    initialize(*context, size, elementSize, name);
+}
+
 void CudaArray::upload(const void* data, bool blocking) {
+    if (pointer == 0)
+        throw OpenMMException("CudaArray has not been initialized");
    CUresult result;
    if (blocking)
        result = cuMemcpyHtoD(pointer, data, size*elementSize);
    else
-        result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context.getCurrentStream());
+        result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context->getCurrentStream());
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
@@ -67,11 +97,13 @@ void CudaArray::upload(const void* data, bool blocking) {
 }

 void CudaArray::download(void* data, bool blocking) const {
+    if (pointer == 0)
+        throw OpenMMException("CudaArray has not been initialized");
    CUresult result;
    if (blocking)
        result = cuMemcpyDtoH(data, pointer, size*elementSize);
    else
-        result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context.getCurrentStream());
+        result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context->getCurrentStream());
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
@@ -80,9 +112,11 @@ void CudaArray::download(void* data, bool blocking) const {
 }

 void CudaArray::copyTo(CudaArray& dest) const {
+    if (pointer == 0)
+        throw OpenMMException("CudaArray has not been initialized");
    if (dest.getSize() != size || dest.getElementSize() != elementSize)
        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
-    CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context.getCurrentStream());
+    CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";

--- a/platforms/cuda/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda/src/CudaBondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -37,12 +37,6 @@ using namespace std;
 CudaBondedUtilities::CudaBondedUtilities(CudaContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
 }

-CudaBondedUtilities::~CudaBondedUtilities() {
-    for (int i = 0; i < (int) atomIndices.size(); i++)
-        for (int j = 0; j < (int) atomIndices[i].size(); j++)
-            delete atomIndices[i][j];
-}
-
 void CudaBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
    if (atoms.size() > 0) {
        forceAtoms.push_back(atoms);
@@ -99,9 +93,9 @@ void CudaBondedUtilities::initialize(const System& system) {
                for (int atom = 0; atom < width; atom++)
                    indexVec[bond*paddedWidth+atom] = forceAtoms[i][bond][startAtom+atom];
            }
-            CudaArray* indices = new CudaArray(context, numBonds, 4*paddedWidth, "bondedIndices");
-            indices->upload(&indexVec[0]);
-            atomIndices[i].push_back(indices);
+            atomIndices[i].push_back(CudaArray());
+            atomIndices[i].back().initialize(context, numBonds, 4*paddedWidth, "bondedIndices");
+            atomIndices[i].back().upload(&indexVec[0]);
            startAtom += width;
        }
    }
@@ -115,7 +109,7 @@ void CudaBondedUtilities::initialize(const System& system) {
    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
    for (int force = 0; force < numForces; force++) {
        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
-            int indexWidth = atomIndices[force][i]->getElementSize()/4;
+            int indexWidth = atomIndices[force][i].getElementSize()/4;
            string indexType = "uint"+context.intToString(indexWidth);
            s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
        }
@@ -154,7 +148,7 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
    s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
    int startAtom = 0;
    for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
-        int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4;
+        int indexWidth = atomIndices[forceIndex][i].getElementSize()/4;
        string indexType = "uint"+context.intToString(indexWidth);
        s<<"    "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
        int atomsToLoad = min(indexWidth, numAtoms-startAtom);
@@ -191,7 +185,7 @@ void CudaBondedUtilities::computeInteractions(int groups) {
        kernelArgs.push_back(context.getPeriodicBoxVecZPointer());
        for (int i = 0; i < (int) atomIndices.size(); i++)
            for (int j = 0; j < (int) atomIndices[i].size(); j++)
-                kernelArgs.push_back(&atomIndices[i][j]->getDevicePointer());
+                kernelArgs.push_back(&atomIndices[i][j].getDevicePointer());
        for (int i = 0; i < (int) arguments.size(); i++)
            kernelArgs.push_back(&arguments[i]);
        if (energyParameterDerivatives.size() > 0)

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -108,8 +108,7 @@ static int executeInWindows(const string &command) {
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
-        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
    
    this->compiler = "\""+compiler+"\"";
@@ -268,8 +267,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["BALLOT(var)"] = "__ballot(var);";
    }
    if (useDoublePrecision) {
-        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
-        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
+        posq.initialize<double4>(*this, paddedNumAtoms, "posq");
+        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
        compilationDefines["make_real2"] = "make_double2";
        compilationDefines["make_real3"] = "make_double3";
@@ -279,9 +278,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed4"] = "make_double4";
    }
    else if (useMixedPrecision) {
-        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
-        posqCorrection = CudaArray::create<float4>(*this, paddedNumAtoms, "posqCorrection");
-        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
+        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
+        posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
+        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
        compilationDefines["USE_MIXED_PRECISION"] = "1";
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
@@ -291,8 +290,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed4"] = "make_double4";
    }
    else {
-        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
-        velm = CudaArray::create<float4>(*this, paddedNumAtoms, "velm");
+        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
+        velm.initialize<float4>(*this, paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
@@ -415,24 +414,6 @@ CudaContext::~CudaContext() {
        delete computation;
    if (pinnedBuffer != NULL)
        cuMemFreeHost(pinnedBuffer);
-    if (posq != NULL)
-        delete posq;
-    if (posqCorrection != NULL)
-        delete posqCorrection;
-    if (velm != NULL)
-        delete velm;
-    if (force != NULL)
-        delete force;
-    if (energyBuffer != NULL)
-        delete energyBuffer;
-    if (energySum != NULL)
-        delete energySum;
-    if (energyParamDerivBuffer != NULL)
-        delete energyParamDerivBuffer;
-    if (atomIndexDevice != NULL)
-        delete atomIndexDevice;
-    if (chargeBuffer != NULL)
-        delete chargeBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
@@ -456,20 +437,20 @@ void CudaContext::initialize() {
    string errorMessage = "Error initializing Context";
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
-        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
-        energySum = CudaArray::create<double>(*this, 1, "energySum");
+        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<double>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
-        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
-        energySum = CudaArray::create<double>(*this, 1, "energySum");
+        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<double>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
-        energySum = CudaArray::create<float>(*this, 1, "energySum");
+        energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<float>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
@@ -480,24 +461,24 @@ void CudaContext::initialize() {
        else
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
    }
-    velm->upload(pinnedBuffer);
+    velm.upload(pinnedBuffer);
    bonded->initialize(system);
-    force = CudaArray::create<long long>(*this, paddedNumAtoms*3, "force");
-    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize());
-    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize());
+    force.initialize<long long>(*this, paddedNumAtoms*3, "force");
+    addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
+    addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
    int numEnergyParamDerivs = energyParamDerivNames.size();
    if (numEnergyParamDerivs > 0) {
        if (useDoublePrecision || useMixedPrecision)
-            energyParamDerivBuffer = CudaArray::create<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+            energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
        else
-            energyParamDerivBuffer = CudaArray::create<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
-        addAutoclearBuffer(*energyParamDerivBuffer);
+            energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        addAutoclearBuffer(energyParamDerivBuffer);
    }
-    atomIndexDevice = CudaArray::create<int>(*this, paddedNumAtoms, "atomIndex");
+    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
        atomIndex[i] = i;
-    atomIndexDevice->upload(atomIndex);
+    atomIndexDevice.upload(atomIndex);
    findMoleculeGroups();
    nonbonded->initialize(system);
 }
@@ -890,11 +871,11 @@ void CudaContext::clearAutoclearBuffers() {
 }

 double CudaContext::reduceEnergy() {
-    int bufferSize = energyBuffer->getSize();
+    int bufferSize = energyBuffer.getSize();
    int workGroupSize  = 512;
-    void* args[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &bufferSize, &workGroupSize};
-    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer->getElementSize());
-    energySum->download(pinnedBuffer);
+    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
+    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
+    energySum.download(pinnedBuffer);
    if (getUseDoublePrecision() || getUseMixedPrecision())
        return *((double*) pinnedBuffer);
    else
@@ -902,21 +883,21 @@ double CudaContext::reduceEnergy() {
 }

 void CudaContext::setCharges(const vector<double>& charges) {
-    if (chargeBuffer == NULL)
-        chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
+    if (!chargeBuffer.isInitialized()) // allocate the device-side charge array only on the first call
+        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); // numAtoms elements, double or float wide
    if (getUseDoublePrecision()) {
        double* c = (double*) getPinnedBuffer();
        for (int i = 0; i < charges.size(); i++)
            c[i] = charges[i];
-        chargeBuffer->upload(c);
+        chargeBuffer.upload(c); // c is the pinned buffer staged above; NOTE(review): presumably charges.size() == numAtoms - confirm
    }
    else {
        float* c = (float*) getPinnedBuffer();
        for (int i = 0; i < charges.size(); i++)
            c[i] = (float) charges[i];
-        chargeBuffer->upload(c);
+        chargeBuffer.upload(c); // same staging path, with each charge narrowed to float first
    }
-    void* args[] = {&chargeBuffer->getDevicePointer(), &posq->getDevicePointer(), &atomIndexDevice->getDevicePointer(), &numAtoms};
+    void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms}; // arguments for setChargesKernel: charges, posq, atom index array, atom count
    executeKernel(setChargesKernel, args, numAtoms);
 }

@@ -1178,16 +1159,16 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
        vector<double4> newPosq(paddedNumAtoms, make_double4(0, 0, 0, 0));
        vector<double4> oldVelm(paddedNumAtoms);
        vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
-        posq->download(oldPosq);
-        velm->download(oldVelm);
+        posq.download(oldPosq);
+        velm.download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
-        posq->upload(newPosq);
-        velm->upload(newVelm);
+        posq.upload(newPosq);
+        velm.upload(newVelm);
    }
    else if (useMixedPrecision) {
        vector<float4> oldPosq(paddedNumAtoms);
@@ -1196,8 +1177,8 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
        vector<float4> newPosqCorrection(paddedNumAtoms, make_float4(0, 0, 0, 0));
        vector<double4> oldVelm(paddedNumAtoms);
        vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
-        posq->download(oldPosq);
-        velm->download(oldVelm);
+        posq.download(oldPosq);
+        velm.download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
@@ -1205,31 +1186,31 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
-        posq->upload(newPosq);
-        posqCorrection->upload(newPosqCorrection);
-        velm->upload(newVelm);
+        posq.upload(newPosq);
+        posqCorrection.upload(newPosqCorrection);
+        velm.upload(newVelm);
    }
    else {
        vector<float4> oldPosq(paddedNumAtoms);
        vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
        vector<float4> oldVelm(paddedNumAtoms);
        vector<float4> newVelm(paddedNumAtoms, make_float4(0, 0, 0, 0));
-        posq->download(oldPosq);
-        velm->download(oldVelm);
+        posq.download(oldPosq);
+        velm.download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
-        posq->upload(newPosq);
-        velm->upload(newVelm);
+        posq.upload(newPosq);
+        velm.upload(newVelm);
    }
    for (int i = 0; i < numAtoms; i++) {
        atomIndex[i] = i;
        posCellOffsets[i] = newCellOffsets[i];
    }
-    atomIndexDevice->upload(atomIndex);
+    atomIndexDevice.upload(atomIndex);
    findMoleculeGroups();
    for (auto listener : reorderListeners)
        listener->execute();
@@ -1262,10 +1243,10 @@ void CudaContext::reorderAtomsImpl() {
    vector<Real4> oldPosqCorrection(paddedNumAtoms, padding);
    Mixed4 paddingMixed = {0, 0, 0, 0};
    vector<Mixed4> oldVelm(paddedNumAtoms, paddingMixed);
-    posq->download(oldPosq);
-    velm->download(oldVelm);
+    posq.download(oldPosq);
+    velm.download(oldVelm);
    if (useMixedPrecision)
-        posqCorrection->download(oldPosqCorrection);
+        posqCorrection.download(oldPosqCorrection);
    Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
    Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
    Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
@@ -1409,11 +1390,11 @@ void CudaContext::reorderAtomsImpl() {
        atomIndex[i] = originalIndex[i];
        posCellOffsets[i] = newCellOffsets[i];
    }
-    posq->upload(newPosq);
+    posq.upload(newPosq);
    if (useMixedPrecision)
-        posqCorrection->upload(newPosqCorrection);
-    velm->upload(newVelm);
-    atomIndexDevice->upload(atomIndex);
+        posqCorrection.upload(newPosqCorrection);
+    velm.upload(newVelm);
+    atomIndexDevice.upload(atomIndex);
    for (auto listener : reorderListeners)
        listener->execute();
 }

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -98,30 +98,24 @@ struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int,
 };

 CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
-        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
-        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
-        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
-        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
-        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
-        vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL) {
+        randomPos(0), ccmaConvergedMemory(NULL) { // ccmaConvergedMemory is still a raw pointer: it must start NULL for the destructor's NULL check
    // Create workspace arrays.

    lastStepSize = make_double2(0.0, 0.0);
    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
-        posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
-        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
-        posDelta->upload(deltas);
-        stepSize = CudaArray::create<double2>(context, 1, "stepSize");
-        stepSize->upload(&lastStepSize);
+        posDelta.initialize<double4>(context, context.getPaddedNumAtoms(), "posDelta");
+        vector<double4> deltas(posDelta.getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
+        posDelta.upload(deltas);
+        stepSize.initialize<double2>(context, 1, "stepSize");
+        stepSize.upload(&lastStepSize);
    }
    else {
-        posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
-        vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
-        posDelta->upload(deltas);
-        stepSize = CudaArray::create<float2>(context, 1, "stepSize");
+        posDelta.initialize<float4>(context, context.getPaddedNumAtoms(), "posDelta");
+        vector<float4> deltas(posDelta.getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
+        posDelta.upload(deltas);
+        stepSize.initialize<float2>(context, 1, "stepSize");
        float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
-        stepSize->upload(&lastStepSizeFloat);
+        stepSize.upload(&lastStepSizeFloat);
    }

    // Record the set of constraints and how many constraints each atom is involved in.
@@ -208,10 +202,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
            isShakeAtom[atom3] = true;
        }
        if (atoms.size() > 0) {
-            settleAtoms = CudaArray::create<int4>(context, atoms.size(), "settleAtoms");
-            settleParams = CudaArray::create<float2>(context, params.size(), "settleParams");
-            settleAtoms->upload(atoms);
-            settleParams->upload(params);
+            settleAtoms.initialize<int4>(context, atoms.size(), "settleAtoms");
+            settleParams.initialize<float2>(context, params.size(), "settleParams");
+            settleAtoms.upload(atoms);
+            settleParams.upload(params);
        }
    }

@@ -291,10 +285,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                isShakeAtom[cluster.peripheralID[2]] = true;
            ++index;
        }
-        shakeAtoms = CudaArray::create<int4>(context, atoms.size(), "shakeAtoms");
-        shakeParams = CudaArray::create<float4>(context, params.size(), "shakeParams");
-        shakeAtoms->upload(atoms);
-        shakeParams->upload(params);
+        shakeAtoms.initialize<int4>(context, atoms.size(), "shakeAtoms");
+        shakeParams.initialize<float4>(context, params.size(), "shakeParams");
+        shakeAtoms.upload(atoms);
+        shakeParams.upload(params);
    }

    // Find connected constraints for CCMA.
@@ -371,26 +365,26 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S

        // Record the CCMA data structures.

-        ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
-        ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
-        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
-        ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
+        ccmaAtoms.initialize<int2>(context, numCCMA, "CcmaAtoms");
+        ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
+        ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
+        ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
-        vector<int2> atomsVec(ccmaAtoms->getSize());
-        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
-        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
-        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
+        vector<int2> atomsVec(ccmaAtoms.getSize());
+        vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
+        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
+        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
        if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
-            ccmaDistance = CudaArray::create<double4>(context, numCCMA, "CcmaDistance");
-            ccmaDelta1 = CudaArray::create<double>(context, numCCMA, "CcmaDelta1");
-            ccmaDelta2 = CudaArray::create<double>(context, numCCMA, "CcmaDelta2");
-            ccmaReducedMass = CudaArray::create<double>(context, numCCMA, "CcmaReducedMass");
-            ccmaConstraintMatrixValue = CudaArray::create<double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
-            vector<double4> distanceVec(ccmaDistance->getSize());
-            vector<double> reducedMassVec(ccmaReducedMass->getSize());
-            vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
+            ccmaDistance.initialize<double4>(context, numCCMA, "CcmaDistance");
+            ccmaDelta1.initialize<double>(context, numCCMA, "CcmaDelta1");
+            ccmaDelta2.initialize<double>(context, numCCMA, "CcmaDelta2");
+            ccmaReducedMass.initialize<double>(context, numCCMA, "CcmaReducedMass");
+            ccmaConstraintMatrixValue.initialize<double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+            vector<double4> distanceVec(ccmaDistance.getSize());
+            vector<double> reducedMassVec(ccmaReducedMass.getSize());
+            vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
            for (int i = 0; i < numCCMA; i++) {
                int index = constraintOrder[i];
                int c = ccmaConstraints[index];
@@ -404,19 +398,19 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                }
                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
            }
-            ccmaDistance->upload(distanceVec);
-            ccmaReducedMass->upload(reducedMassVec);
-            ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
+            ccmaDistance.upload(distanceVec);
+            ccmaReducedMass.upload(reducedMassVec);
+            ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
        }
        else {
-            ccmaDistance = CudaArray::create<float4>(context, numCCMA, "CcmaDistance");
-            ccmaDelta1 = CudaArray::create<float>(context, numCCMA, "CcmaDelta1");
-            ccmaDelta2 = CudaArray::create<float>(context, numCCMA, "CcmaDelta2");
-            ccmaReducedMass = CudaArray::create<float>(context, numCCMA, "CcmaReducedMass");
-            ccmaConstraintMatrixValue = CudaArray::create<float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
-            vector<float4> distanceVec(ccmaDistance->getSize());
-            vector<float> reducedMassVec(ccmaReducedMass->getSize());
-            vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
+            ccmaDistance.initialize<float4>(context, numCCMA, "CcmaDistance");
+            ccmaDelta1.initialize<float>(context, numCCMA, "CcmaDelta1");
+            ccmaDelta2.initialize<float>(context, numCCMA, "CcmaDelta2");
+            ccmaReducedMass.initialize<float>(context, numCCMA, "CcmaReducedMass");
+            ccmaConstraintMatrixValue.initialize<float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+            vector<float4> distanceVec(ccmaDistance.getSize());
+            vector<float> reducedMassVec(ccmaReducedMass.getSize());
+            vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
            for (int i = 0; i < numCCMA; i++) {
                int index = constraintOrder[i];
                int c = ccmaConstraints[index];
@@ -430,9 +424,9 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                }
                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
            }
-            ccmaDistance->upload(distanceVec);
-            ccmaReducedMass->upload(reducedMassVec);
-            ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
+            ccmaDistance.upload(distanceVec);
+            ccmaReducedMass.upload(reducedMassVec);
+            ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
        }
        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
            numAtomConstraintsVec[i] = atomConstraints[i].size();
@@ -441,10 +435,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
            }
        }
-        ccmaAtoms->upload(atomsVec);
-        ccmaAtomConstraints->upload(atomConstraintsVec);
-        ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
-        ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
+        ccmaAtoms.upload(atomsVec);
+        ccmaAtomConstraints.upload(atomConstraintsVec);
+        ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
+        ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
    }
    
    // Build the list of virtual sites.
@@ -510,73 +504,73 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
    int numLocalCoords = vsiteLocalCoordsPosVec.size();
-    vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
-    vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
-    vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
-    vsiteLocalCoordsIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
-    vsiteLocalCoordsAtoms = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
-    vsiteLocalCoordsStartIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
+    vsite2AvgAtoms.initialize<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
+    vsite3AvgAtoms.initialize<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
+    vsiteOutOfPlaneAtoms.initialize<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
+    vsiteLocalCoordsIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
+    vsiteLocalCoordsAtoms.initialize<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
+    vsiteLocalCoordsStartIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
    if (num2Avg > 0)
-        vsite2AvgAtoms->upload(vsite2AvgAtomVec);
+        vsite2AvgAtoms.upload(vsite2AvgAtomVec);
    if (num3Avg > 0)
-        vsite3AvgAtoms->upload(vsite3AvgAtomVec);
+        vsite3AvgAtoms.upload(vsite3AvgAtomVec);
    if (numOutOfPlane > 0)
-        vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
+        vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
    if (numLocalCoords > 0) {
-        vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec);
-        vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec);
-        vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec);
+        vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
+        vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
+        vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
    }
    if (context.getUseDoublePrecision()) {
-        vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
-        vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
-        vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsWeights = CudaArray::create<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
-        vsiteLocalCoordsPos = CudaArray::create<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
+        vsite2AvgWeights.initialize<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
+        vsite3AvgWeights.initialize<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
+        vsiteOutOfPlaneWeights.initialize<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
+        vsiteLocalCoordsWeights.initialize<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+        vsiteLocalCoordsPos.initialize<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
        if (num2Avg > 0)
-            vsite2AvgWeights->upload(vsite2AvgWeightVec);
+            vsite2AvgWeights.upload(vsite2AvgWeightVec);
        if (num3Avg > 0)
-            vsite3AvgWeights->upload(vsite3AvgWeightVec);
+            vsite3AvgWeights.upload(vsite3AvgWeightVec);
        if (numOutOfPlane > 0)
-            vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
+            vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec);
        if (numLocalCoords > 0) {
-            vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec);
-            vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec);
+            vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec);
+            vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec);
        }
    }
    else {
-        vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
-        vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
-        vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsWeights = CudaArray::create<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
-        vsiteLocalCoordsPos = CudaArray::create<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
+        vsite2AvgWeights.initialize<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
+        vsite3AvgWeights.initialize<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
+        vsiteOutOfPlaneWeights.initialize<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
+        vsiteLocalCoordsWeights.initialize<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+        vsiteLocalCoordsPos.initialize<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
        if (num2Avg > 0) {
            vector<float2> floatWeights(num2Avg);
            for (int i = 0; i < num2Avg; i++)
                floatWeights[i] = make_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y);
-            vsite2AvgWeights->upload(floatWeights);
+            vsite2AvgWeights.upload(floatWeights);
        }
        if (num3Avg > 0) {
            vector<float4> floatWeights(num3Avg);
            for (int i = 0; i < num3Avg; i++)
                floatWeights[i] = make_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f);
-            vsite3AvgWeights->upload(floatWeights);
+            vsite3AvgWeights.upload(floatWeights);
        }
        if (numOutOfPlane > 0) {
            vector<float4> floatWeights(numOutOfPlane);
            for (int i = 0; i < numOutOfPlane; i++)
                floatWeights[i] = make_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f);
-            vsiteOutOfPlaneWeights->upload(floatWeights);
+            vsiteOutOfPlaneWeights.upload(floatWeights);
        }
        if (numLocalCoords > 0) {
            vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
            for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
                floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
-            vsiteLocalCoordsWeights->upload(floatWeights);
+            vsiteLocalCoordsWeights.upload(floatWeights);
            vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
            for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
                floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
-            vsiteLocalCoordsPos->upload(floatPos);
+            vsiteLocalCoordsPos.upload(floatPos);
        }
    }

@@ -610,86 +604,28 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S

 CudaIntegrationUtilities::~CudaIntegrationUtilities() {
    context.setAsCurrent(); // NOTE(review): presumably so the free below runs against this object's CUDA context - confirm
-    if (posDelta != NULL)
-        delete posDelta;
-    if (settleAtoms != NULL)
-        delete settleAtoms;
-    if (settleParams != NULL)
-        delete settleParams;
-    if (shakeAtoms != NULL)
-        delete shakeAtoms;
-    if (shakeParams != NULL)
-        delete shakeParams;
-    if (random != NULL)
-        delete random;
-    if (randomSeed != NULL)
-        delete randomSeed;
-    if (stepSize != NULL)
-        delete stepSize;
-    if (ccmaAtoms != NULL)
-        delete ccmaAtoms;
-    if (ccmaDistance != NULL)
-        delete ccmaDistance;
-    if (ccmaReducedMass != NULL)
-        delete ccmaReducedMass;
-    if (ccmaAtomConstraints != NULL)
-        delete ccmaAtomConstraints;
-    if (ccmaNumAtomConstraints != NULL)
-        delete ccmaNumAtomConstraints;
-    if (ccmaConstraintMatrixColumn != NULL)
-        delete ccmaConstraintMatrixColumn;
-    if (ccmaConstraintMatrixValue != NULL)
-        delete ccmaConstraintMatrixValue;
-    if (ccmaDelta1 != NULL)
-        delete ccmaDelta1;
-    if (ccmaDelta2 != NULL)
-        delete ccmaDelta2;
-    if (ccmaConverged != NULL)
-        delete ccmaConverged;
    if (ccmaConvergedMemory != NULL) // the one remaining manual release: this buffer comes from cuMemHostAlloc, not from a CudaArray
        cuMemFreeHost(ccmaConvergedMemory);
-    if (vsite2AvgAtoms != NULL)
-        delete vsite2AvgAtoms;
-    if (vsite2AvgWeights != NULL)
-        delete vsite2AvgWeights;
-    if (vsite3AvgAtoms != NULL)
-        delete vsite3AvgAtoms;
-    if (vsite3AvgWeights != NULL)
-        delete vsite3AvgWeights;
-    if (vsiteOutOfPlaneAtoms != NULL)
-        delete vsiteOutOfPlaneAtoms;
-    if (vsiteOutOfPlaneWeights != NULL)
-        delete vsiteOutOfPlaneWeights;
-    if (vsiteLocalCoordsIndex != NULL)
-        delete vsiteLocalCoordsIndex;
-    if (vsiteLocalCoordsAtoms != NULL)
-        delete vsiteLocalCoordsAtoms;
-    if (vsiteLocalCoordsWeights != NULL)
-        delete vsiteLocalCoordsWeights;
-    if (vsiteLocalCoordsPos != NULL)
-        delete vsiteLocalCoordsPos;
-    if (vsiteLocalCoordsStartIndex != NULL)
-        delete vsiteLocalCoordsStartIndex;
 }

 void CudaIntegrationUtilities::setNextStepSize(double size) {
    if (size != lastStepSize.x || size != lastStepSize.y) { // upload only when the cached host copy differs from the requested size
        lastStepSize = make_double2(size, size);
        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-            stepSize->upload(&lastStepSize);
+            stepSize.upload(&lastStepSize); // double/mixed precision: the cached double2 is uploaded as is
        else {
            float2 lastStepSizeFloat = make_float2((float) size, (float) size);
-            stepSize->upload(&lastStepSizeFloat);
+            stepSize.upload(&lastStepSizeFloat); // single precision: upload the float2 copy instead of the double2 cache
        }
    }
 }

 double CudaIntegrationUtilities::getLastStepSize() {
    if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-        stepSize->download(&lastStepSize);
+        stepSize.download(&lastStepSize);
    else {
        float2 lastStepSizeFloat;
-        stepSize->download(&lastStepSizeFloat);
+        stepSize.download(&lastStepSizeFloat);
        lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
    }
    return lastStepSize.y;
@@ -718,41 +654,41 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
    float floatTol = (float) tol;
    void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol);
    CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
-    if (settleAtoms != NULL) {
-        int numClusters = settleAtoms->getSize();
+    if (settleAtoms.isInitialized()) {
+        int numClusters = settleAtoms.getSize();
        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
-                &posDelta->getDevicePointer(), &context.getVelm().getDevicePointer(),
-                &settleAtoms->getDevicePointer(), &settleParams->getDevicePointer()};
-        context.executeKernel(settleKernel, args, settleAtoms->getSize());
+                &posDelta.getDevicePointer(), &context.getVelm().getDevicePointer(),
+                &settleAtoms.getDevicePointer(), &settleParams.getDevicePointer()};
+        context.executeKernel(settleKernel, args, settleAtoms.getSize());
    }
-    if (shakeAtoms != NULL) {
-        int numClusters = shakeAtoms->getSize();
+    if (shakeAtoms.isInitialized()) {
+        int numClusters = shakeAtoms.getSize();
        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
-                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &shakeAtoms->getDevicePointer(), &shakeParams->getDevicePointer()};
-        context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
+                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
+                &shakeAtoms.getDevicePointer(), &shakeParams.getDevicePointer()};
+        context.executeKernel(shakeKernel, args, shakeAtoms.getSize());
    }
-    if (ccmaAtoms != NULL) {
-        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
-        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
+    if (ccmaAtoms.isInitialized()) {
+        void* directionsArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged.getDevicePointer()};
+        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms.getSize());
        int i;
-        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
-                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
+        void* forceArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(),
+                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
+                &ccmaReducedMass.getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaConverged.getDevicePointer(),
                &ccmaConvergedDeviceMemory, tolPointer, &i};
-        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
-        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
-                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConverged->getDevicePointer(), &i};
+        void* multiplyArgs[] = {&ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
+                &ccmaConstraintMatrixColumn.getDevicePointer(), &ccmaConstraintMatrixValue.getDevicePointer(), &ccmaConverged.getDevicePointer(), &i};
+        void* updateArgs[] = {&ccmaNumAtomConstraints.getDevicePointer(), &ccmaAtomConstraints.getDevicePointer(), &ccmaDistance.getDevicePointer(),
+                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
+                &context.getVelm().getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
+                &ccmaConverged.getDevicePointer(), &i};
        const int checkInterval = 4;
        ccmaConvergedMemory[0] = 0;
        for (i = 0; i < 150; i++) {
-            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
+            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms.getSize());
            if ((i+1)%checkInterval == 0)
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
-            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
+            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms.getSize());
            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
@@ -766,12 +702,12 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
 void CudaIntegrationUtilities::computeVirtualSites() {
    if (numVsites > 0) {
        CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
-        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
-                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
-                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
-                &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
-                &vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
-                &vsiteLocalCoordsStartIndex->getDevicePointer()};
+        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms.getDevicePointer(), &vsite2AvgWeights.getDevicePointer(),
+                &vsite3AvgAtoms.getDevicePointer(), &vsite3AvgWeights.getDevicePointer(),
+                &vsiteOutOfPlaneAtoms.getDevicePointer(), &vsiteOutOfPlaneWeights.getDevicePointer(),
+                &vsiteLocalCoordsIndex.getDevicePointer(), &vsiteLocalCoordsAtoms.getDevicePointer(),
+                &vsiteLocalCoordsWeights.getDevicePointer(), &vsiteLocalCoordsPos.getDevicePointer(),
+                &vsiteLocalCoordsStartIndex.getDevicePointer()};
        context.executeKernel(vsitePositionKernel, args, numVsites);
    }
 }
@@ -780,18 +716,18 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
        CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &context.getForce().getDevicePointer(),
-                &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
-                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
-                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
-                &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
-                &vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
-                &vsiteLocalCoordsStartIndex->getDevicePointer()};
+                &vsite2AvgAtoms.getDevicePointer(), &vsite2AvgWeights.getDevicePointer(),
+                &vsite3AvgAtoms.getDevicePointer(), &vsite3AvgWeights.getDevicePointer(),
+                &vsiteOutOfPlaneAtoms.getDevicePointer(), &vsiteOutOfPlaneWeights.getDevicePointer(),
+                &vsiteLocalCoordsIndex.getDevicePointer(), &vsiteLocalCoordsAtoms.getDevicePointer(),
+                &vsiteLocalCoordsWeights.getDevicePointer(), &vsiteLocalCoordsPos.getDevicePointer(),
+                &vsiteLocalCoordsStartIndex.getDevicePointer()};
        context.executeKernel(vsiteForceKernel, args, numVsites);
    }
 }

 void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
-    if (random != NULL) {
+    if (random.isInitialized()) {
        if (randomNumberSeed != lastSeed)
           throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
        return;
@@ -800,63 +736,61 @@ void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumb
    // Create the random number arrays.

    lastSeed = randomNumberSeed;
-    random = CudaArray::create<float4>(context, 4*context.getPaddedNumAtoms(), "random");
-    randomSeed = CudaArray::create<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
-    randomPos = random->getSize();
+    random.initialize<float4>(context, 4*context.getPaddedNumAtoms(), "random");
+    randomSeed.initialize<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
+    randomPos = random.getSize();

    // Use a quick and dirty RNG to pick seeds for the real random number generator.

-    vector<int4> seed(randomSeed->getSize());
+    vector<int4> seed(randomSeed.getSize());
    unsigned int r = randomNumberSeed;
    if (r == 0) r = (unsigned int) osrngseed();
-    for (int i = 0; i < randomSeed->getSize(); i++) {
+    for (int i = 0; i < randomSeed.getSize(); i++) {
        seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
        seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
        seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
        seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
    }
-    randomSeed->upload(seed);
+    randomSeed.upload(seed);
 }

 int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
-    if (randomPos+numValues <= random->getSize()) {
+    if (randomPos+numValues <= random.getSize()) {
        int oldPos = randomPos;
        randomPos += numValues;
        return oldPos;
    }
-    if (numValues > random->getSize()) {
-        delete random;
-        random = CudaArray::create<float4>(context, numValues, "random");
-    }
-    int size = random->getSize();
-    void* args[] = {&size, &random->getDevicePointer(), &randomSeed->getDevicePointer()};
-    context.executeKernel(randomKernel, args, random->getSize());
+    if (numValues > random.getSize())
+        random.resize(numValues);
+    int size = random.getSize();
+    void* args[] = {&size, &random.getDevicePointer(), &randomSeed.getDevicePointer()};
+    context.executeKernel(randomKernel, args, random.getSize());
    randomPos = numValues;
    return 0;
 }

 void CudaIntegrationUtilities::createCheckpoint(ostream& stream) {
-    if(random == NULL) 
+    if (!random.isInitialized()) 
        return;
    stream.write((char*) &randomPos, sizeof(int));
    vector<float4> randomVec;
-    random->download(randomVec);
-    stream.write((char*) &randomVec[0], sizeof(float4)*random->getSize());
+    random.download(randomVec);
+    stream.write((char*) &randomVec[0], sizeof(float4)*random.getSize());
    vector<int4> randomSeedVec;
-    randomSeed->download(randomSeedVec);
-    stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize());
+    randomSeed.download(randomSeedVec);
+    stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed.getSize());
 }

 void CudaIntegrationUtilities::loadCheckpoint(istream& stream) {
-    if(random == NULL) 
+    if (!random.isInitialized()) 
        return;
    stream.read((char*) &randomPos, sizeof(int));
-    vector<float4> randomVec(random->getSize());
-    stream.read((char*) &randomVec[0], sizeof(float4)*random->getSize());
-    random->upload(randomVec);
-    vector<int4> randomSeedVec(randomSeed->getSize());
-    stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize());
-    randomSeed->upload(randomSeedVec);
+    vector<float4> randomVec(random.getSize());
+    stream.read((char*) &randomVec[0], sizeof(float4)*random.getSize());
+    random.upload(randomVec);
+    vector<int4> randomSeedVec(randomSeed.getSize());
+    stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed.getSize());
+    randomSeed.upload(randomSeedVec);
 }

 double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
@@ -867,7 +801,7 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {

        // Copy the velocities into the posDelta array while we temporarily modify them.

-        context.getVelm().copyTo(*posDelta);
+        context.getVelm().copyTo(posDelta);

        // Apply the time shift.

@@ -901,6 +835,6 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
    // Restore the velocities.
    
    if (timeShift != 0)
-        posDelta->copyTo(context.getVelm());
+        posDelta.copyTo(context.getVelm());
    return 0.5*energy;
 }
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -63,10 +63,7 @@ private:
 };

 CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
-        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
-        interactionCount(NULL), singlePairs(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0),
-        canUsePairList(true) {
+        blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true) {
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
@@ -79,36 +76,6 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
 }

 CudaNonbondedUtilities::~CudaNonbondedUtilities() {
-    if (exclusionIndices != NULL)
-        delete exclusionIndices;
-    if (exclusionRowIndices != NULL)
-        delete exclusionRowIndices;
-    if (exclusionTiles != NULL)
-        delete exclusionTiles;
-    if (exclusions != NULL)
-        delete exclusions;
-    if (interactingTiles != NULL)
-        delete interactingTiles;
-    if (interactingAtoms != NULL)
-        delete interactingAtoms;
-    if (interactionCount != NULL)
-        delete interactionCount;
-    if (singlePairs != NULL)
-        delete singlePairs;
-    if (blockCenter != NULL)
-        delete blockCenter;
-    if (blockBoundingBox != NULL)
-        delete blockBoundingBox;
-    if (sortedBlocks != NULL)
-        delete sortedBlocks;
-    if (sortedBlockCenter != NULL)
-        delete sortedBlockCenter;
-    if (sortedBlockBoundingBox != NULL)
-        delete sortedBlockBoundingBox;
-    if (oldPositions != NULL)
-        delete oldPositions;
-    if (rebuildNeighborList != NULL)
-        delete rebuildNeighborList;
    if (blockSorter != NULL)
        delete blockSorter;
    if (pinnedCountBuffer != NULL)
@@ -220,8 +187,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
        exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
-    exclusionTiles = CudaArray::create<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
-    exclusionTiles->upload(exclusionTilesVec);
+    exclusionTiles.initialize<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
+    exclusionTiles.upload(exclusionTilesVec);
    map<pair<int, int>, int> exclusionTileMap;
    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
        ushort2 tile = exclusionTilesVec[i];
@@ -242,16 +209,16 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    maxExclusions = 0;
    for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
        maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
-    exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
-    exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
-    exclusionIndices->upload(exclusionIndicesVec);
-    exclusionRowIndices->upload(exclusionRowIndicesVec);
+    exclusionIndices.initialize<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
+    exclusionRowIndices.initialize<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
+    exclusionIndices.upload(exclusionIndicesVec);
+    exclusionRowIndices.upload(exclusionRowIndicesVec);

    // Record the exclusion data.

-    exclusions = CudaArray::create<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
+    exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
    tileflags allFlags = (tileflags) -1;
-    vector<tileflags> exclusionVec(exclusions->getSize(), allFlags);
+    vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/CudaContext::TileSize;
        int offset1 = atom1-x*CudaContext::TileSize;
@@ -270,7 +237,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        }
    }
    atomExclusions.clear(); // We won't use this again, so free the memory it used
-    exclusions->upload(exclusionVec);
+    exclusions.upload(exclusionVec);

    // Create data structures for the neighbor list.

@@ -284,21 +251,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        if (maxTiles < 1)
            maxTiles = 1;
        maxSinglePairs = 5*numAtoms;
-        interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
-        interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
-        interactionCount = CudaArray::create<unsigned int>(context, 2, "interactionCount");
-        singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
+        interactingTiles.initialize<int>(context, maxTiles, "interactingTiles");
+        interactingAtoms.initialize<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
+        interactionCount.initialize<unsigned int>(context, 2, "interactionCount");
+        singlePairs.initialize<int2>(context, maxSinglePairs, "singlePairs");
        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-        blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
-        blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
-        sortedBlocks = new CudaArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
-        sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
-        sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
-        oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
-        rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
+        blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
+        blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+        sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
+        sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
+        sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
+        rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
        blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
        vector<unsigned int> count(2, 0);
-        interactionCount->upload(count);
+        interactionCount.upload(count);
    }

    // Record arguments for kernels.
@@ -306,24 +273,24 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    forceArgs.push_back(&context.getForce().getDevicePointer());
    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
    forceArgs.push_back(&context.getPosq().getDevicePointer());
-    forceArgs.push_back(&exclusions->getDevicePointer());
-    forceArgs.push_back(&exclusionTiles->getDevicePointer());
+    forceArgs.push_back(&exclusions.getDevicePointer());
+    forceArgs.push_back(&exclusionTiles.getDevicePointer());
    forceArgs.push_back(&startTileIndex);
    forceArgs.push_back(&numTiles);
    if (useCutoff) {
-        forceArgs.push_back(&interactingTiles->getDevicePointer());
-        forceArgs.push_back(&interactionCount->getDevicePointer());
+        forceArgs.push_back(&interactingTiles.getDevicePointer());
+        forceArgs.push_back(&interactionCount.getDevicePointer());
        forceArgs.push_back(context.getPeriodicBoxSizePointer());
        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
        forceArgs.push_back(context.getPeriodicBoxVecXPointer());
        forceArgs.push_back(context.getPeriodicBoxVecYPointer());
        forceArgs.push_back(context.getPeriodicBoxVecZPointer());
        forceArgs.push_back(&maxTiles);
-        forceArgs.push_back(&blockCenter->getDevicePointer());
-        forceArgs.push_back(&blockBoundingBox->getDevicePointer());
-        forceArgs.push_back(&interactingAtoms->getDevicePointer());
+        forceArgs.push_back(&blockCenter.getDevicePointer());
+        forceArgs.push_back(&blockBoundingBox.getDevicePointer());
+        forceArgs.push_back(&interactingAtoms.getDevicePointer());
        forceArgs.push_back(&maxSinglePairs);
-        forceArgs.push_back(&singlePairs->getDevicePointer());
+        forceArgs.push_back(&singlePairs.getDevicePointer());
    }
    for (int i = 0; i < (int) parameters.size(); i++)
        forceArgs.push_back(&parameters[i].getMemory());
@@ -339,41 +306,41 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
        findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
-        findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
-        findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
-        findBlockBoundsArgs.push_back(&rebuildNeighborList->getDevicePointer());
-        findBlockBoundsArgs.push_back(&sortedBlocks->getDevicePointer());
-        sortBoxDataArgs.push_back(&sortedBlocks->getDevicePointer());
-        sortBoxDataArgs.push_back(&blockCenter->getDevicePointer());
-        sortBoxDataArgs.push_back(&blockBoundingBox->getDevicePointer());
-        sortBoxDataArgs.push_back(&sortedBlockCenter->getDevicePointer());
-        sortBoxDataArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
+        findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
+        findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
+        sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
+        sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
        sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
-        sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
-        sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
-        sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
+        sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
+        sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
+        sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
        sortBoxDataArgs.push_back(&forceRebuildNeighborList);
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
-        findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactionCount.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingTiles.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingAtoms.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&singlePairs.getDevicePointer());
        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
        findInteractingBlocksArgs.push_back(&maxTiles);
        findInteractingBlocksArgs.push_back(&maxSinglePairs);
        findInteractingBlocksArgs.push_back(&startBlockIndex);
        findInteractingBlocksArgs.push_back(&numBlocks);
-        findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&sortedBlockCenter->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&exclusionIndices->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&exclusionRowIndices->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&oldPositions->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&rebuildNeighborList->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
    }
 }

@@ -406,12 +373,12 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
    context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
-    blockSorter->sort(*sortedBlocks);
+    blockSorter->sort(sortedBlocks);
    context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
    context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
    forceRebuildNeighborList = false;
    lastCutoff = kernels.cutoffDistance;
-    interactionCount->download(pinnedCountBuffer, false);
+    interactionCount.download(pinnedCountBuffer, false);
    cuEventRecord(downloadCountEvent, context.getCurrentStream());
 }

@@ -445,27 +412,21 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
        int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
        if (maxTiles > totalTiles)
            maxTiles = totalTiles;
-        delete interactingTiles;
-        delete interactingAtoms;
-        interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
-        interactingAtoms = NULL;
-        interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
-        interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
+        interactingTiles.resize(maxTiles);
+        interactingAtoms.resize(CudaContext::TileSize*maxTiles);
        if (forceArgs.size() > 0)
-            forceArgs[7] = &interactingTiles->getDevicePointer();
-        findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer();
+            forceArgs[7] = &interactingTiles.getDevicePointer();
+        findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();
        if (forceArgs.size() > 0)
-            forceArgs[17] = &interactingAtoms->getDevicePointer();
-        findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
+            forceArgs[17] = &interactingAtoms.getDevicePointer();
+        findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
    }
    if (pinnedCountBuffer[1] > maxSinglePairs) {
        maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
-        delete singlePairs;
-        singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
-        singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
+        singlePairs.resize(maxSinglePairs);
        if (forceArgs.size() > 0)
-            forceArgs[19] = &singlePairs->getDevicePointer();
-        findInteractingBlocksArgs[8] = &singlePairs->getDevicePointer();
+            forceArgs[19] = &singlePairs.getDevicePointer();
+        findInteractingBlocksArgs[8] = &singlePairs.getDevicePointer();
    }
    forceRebuildNeighborList = true;
    context.setForcesValid(false);
@@ -510,7 +471,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
        defines["PADDING"] = context.doubleToString(padding);
        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
-        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
        if (context.getBoxIsTriclinic())
@@ -735,7 +696,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
    defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
-    int numExclusionTiles = exclusionTiles->getSize();
+    int numExclusionTiles = exclusionTiles.getSize();
    defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
    int numContexts = context.getPlatformData().contexts.size();
    int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;

--- a/platforms/cuda/src/CudaSort.cpp
+++ b/platforms/cuda/src/CudaSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -31,8 +31,7 @@
 using namespace OpenMM;
 using namespace std;

-CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
-        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
+CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
    // Create kernels.

    map<string, string> replacements;
@@ -76,26 +75,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    // Create workspace arrays.

    if (!isShortList) {
-        dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
-        bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
-        bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
-        offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
-        buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
+        dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
+        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
+        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
+        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
+        buckets.initialize(context, length, trait->getDataSize(), "buckets");
    }
 }

 CudaSort::~CudaSort() {
    delete trait;
-    if (dataRange != NULL)
-        delete dataRange;
-    if (bucketOfElement != NULL)
-        delete bucketOfElement;
-    if (offsetInBucket != NULL)
-        delete offsetInBucket;
-    if (bucketOffset != NULL)
-        delete bucketOffset;
-    if (buckets != NULL)
-        delete buckets;
 }

 void CudaSort::sort(CudaArray& data) {
@@ -112,30 +101,30 @@ void CudaSort::sort(CudaArray& data) {
    else {
        // Compute the range of data values.

-        unsigned int numBuckets = bucketOffset->getSize();
-        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
+        unsigned int numBuckets = bucketOffset.getSize();
+        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());

        // Assign array elements to buckets.

-        void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
-                &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
+        void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
+                &bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
        context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);

        // Compute the position of each bucket.

-        void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
+        void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()};
        context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

        // Copy the data into the buckets.

-        void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
-                &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
+        void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
+                &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
        context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

        // Sort each bucket.

-        void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
+        void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
        context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
    }
 }