OpenCLArray is no longer templatized and doesn't provide a host buffer. This is in preparation for adding mixed/double precision support.

OpenCLArray is no longer templatized and doesn't provide a host buffer. This is in preparation for adding mixed/double precision support.

OpenCLArray is no longer templatized and doesn't provide a host buffer. This is in preparation for adding mixed/double precision support.
OpenCLArray is no longer templatized and doesn't provide a host buffer. This is in preparation for adding mixed/double precision support.
1107aa83 · Peter Eastman · 5980100d · 1107aa83 · 1107aa83 · 1107aa83
Commit 1107aa83 authored Oct 10, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -45,7 +45,7 @@ using namespace std;

 CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), pinnedInteractionCount(NULL), nonbondedForceGroup(0) {
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) {
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
@@ -72,8 +72,6 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
        delete blockCenter;
    if (blockBoundingBox != NULL)
        delete blockBoundingBox;
-    if (pinnedInteractionCount != NULL)
-        cuMemFreeHost(pinnedInteractionCount);
 }

 void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -240,9 +238,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            blockCenter = CudaArray::create<float4>(context, numAtomBlocks, "blockCenter");
            blockBoundingBox = CudaArray::create<float4>(context, numAtomBlocks, "blockBoundingBox");
        }
-        CHECK_RESULT(cuMemHostAlloc((void**) &pinnedInteractionCount, sizeof(unsigned int), 0));
-        pinnedInteractionCount[0] = 0;
-        interactionCount->upload(pinnedInteractionCount);
+        vector<unsigned int> count(1, 0);
+        interactionCount->upload(count);
    }

    // Create kernels.
@@ -325,6 +322,7 @@ void CudaNonbondedUtilities::computeInteractions() {
 void CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return;
+    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
    interactionCount->download(pinnedInteractionCount);
    if (pinnedInteractionCount[0] <= (unsigned int) maxTiles)
        return;

--- a/platforms/cuda/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda/src/CudaNonbondedUtilities.h
@@ -259,7 +259,6 @@ private:
    CudaArray* interactionCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
-    unsigned int* pinnedInteractionCount;
    std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -432,6 +432,9 @@ void testLargeSystem() {
    cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
+        double dx = cuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
+        double dy = cuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
+        double dz = cuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);

--- a/platforms/opencl/src/OpenCLArray.cpp
+++ b/platforms/opencl/src/OpenCLArray.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "OpenCLArray.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+using namespace OpenMM;
+
+/**
+ * Create an OpenCLArray that allocates a new OpenCL Buffer of size*elementSize bytes.
+ * The array is marked as owning the Buffer, so the destructor deletes it.
+ *
+ * @param context      the context for which to create the array
+ * @param size         the number of elements in the array
+ * @param elementSize  the size of each element in bytes
+ * @param name         the name of the array (included in error messages)
+ * @param flags        the set of flags to specify when creating the OpenCL Buffer
+ * @throws OpenMMException if the OpenCL Buffer cannot be created
+ */
+OpenCLArray::OpenCLArray(OpenCLContext& context, int size, int elementSize, const std::string& name, cl_int flags) :
+        context(context), size(size), elementSize(elementSize), name(name), ownsBuffer(true) {
+    try {
+        // Do the multiplication in size_t so that large arrays cannot overflow int arithmetic.
+        buffer = new cl::Buffer(context.getContext(), flags, static_cast<size_t>(size)*elementSize);
+    }
+    catch (const cl::Error& err) {
+        // Translate the OpenCL error into the exception type used throughout OpenMM.
+        std::stringstream str;
+        str<<"Error creating array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
+/**
+ * Create an OpenCLArray around a preexisting OpenCL Buffer.  The array is marked
+ * as not owning the Buffer, so the destructor leaves it alone.
+ */
+OpenCLArray::OpenCLArray(OpenCLContext& context, cl::Buffer* buffer, int size, int elementSize, const std::string& name)
+        : context(context),
+          buffer(buffer),
+          size(size),
+          elementSize(elementSize),
+          name(name),
+          ownsBuffer(false) {
+}
+
+/**
+ * Release the OpenCL Buffer, but only if this object allocated it.
+ */
+OpenCLArray::~OpenCLArray() {
+    if (!ownsBuffer)
+        return;
+    delete buffer;
+}
+
+/**
+ * Copy size*elementSize bytes from host memory into the OpenCL Buffer.
+ *
+ * @param data      the host data to copy
+ * @param blocking  if true, the write is enqueued as a blocking operation; if false it is
+ *                  enqueued as non-blocking, in which case OpenCL requires the host memory
+ *                  to remain valid until the transfer has completed
+ * @throws OpenMMException if the write cannot be enqueued
+ */
+void OpenCLArray::upload(const void* data, bool blocking) {
+    try {
+        // Do the multiplication in size_t so that large arrays cannot overflow int arithmetic.
+        context.getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, static_cast<size_t>(size)*elementSize, data);
+    }
+    catch (const cl::Error& err) {
+        // Translate the OpenCL error into the exception type used throughout OpenMM.
+        std::stringstream str;
+        str<<"Error uploading array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
+/**
+ * Copy size*elementSize bytes from the OpenCL Buffer into host memory.
+ *
+ * @param data      the host memory to copy into
+ * @param blocking  if true, the read is enqueued as a blocking operation; if false it is
+ *                  enqueued as non-blocking, in which case the contents of the host memory
+ *                  must not be used until the transfer has completed
+ * @throws OpenMMException if the read cannot be enqueued
+ */
+void OpenCLArray::download(void* data, bool blocking) const {
+    try {
+        // Do the multiplication in size_t so that large arrays cannot overflow int arithmetic.
+        context.getQueue().enqueueReadBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, static_cast<size_t>(size)*elementSize, data);
+    }
+    catch (const cl::Error& err) {
+        // Translate the OpenCL error into the exception type used throughout OpenMM.
+        std::stringstream str;
+        str<<"Error downloading array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
+/**
+ * Copy the full contents of this array's Buffer into another OpenCLArray's Buffer.
+ *
+ * @param dest  the destination array; it must have the same number of elements and
+ *              the same element size as this array
+ * @throws OpenMMException if the sizes do not match or the copy cannot be enqueued
+ */
+void OpenCLArray::copyTo(OpenCLArray& dest) const {
+    // Reject mismatched arrays up front rather than copying a partial or oversized range.
+    if (dest.getSize() != size || dest.getElementSize() != elementSize)
+        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
+    try {
+        // Do the multiplication in size_t so that large arrays cannot overflow int arithmetic.
+        context.getQueue().enqueueCopyBuffer(*buffer, dest.getDeviceBuffer(), 0, 0, static_cast<size_t>(size)*elementSize);
+    }
+    catch (const cl::Error& err) {
+        // Translate the OpenCL error into the exception type used throughout OpenMM.
+        std::stringstream str;
+        str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<err.what()<<" ("<<err.err()<<")";
+        throw OpenMMException(str.str());
+    }
+}
--- a/platforms/opencl/src/OpenCLArray.h
+++ b/platforms/opencl/src/OpenCLArray.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -37,62 +37,70 @@ namespace OpenMM {

 /**
 * This class encapsulates an OpenCL Buffer.  It provides a simplified API for working with it,
- * an optionally includes a buffer in host memory for copying data to and from the OpenCL Buffer.
+ * and for copying data to and from the OpenCL Buffer.
 */

-template <class T>
 class OpenCLArray {
 public:
    /**
-     * Create an OpenCLArray object.
+     * Create an OpenCLArray object.  The object is allocated on the heap with the "new" operator.
+     * The template argument is the data type of each array element.
     *
     * @param context           the context for which to create the array
     * @param size              the number of elements in the array
     * @param name              the name of the array
-     * @param createHostBuffer  specifies whether to create a buffer in host memory for copying data to and from
-     *                          the OpenCL Buffer
     * @param flags             the set of flags to specify when creating the OpenCL Buffer
     */
-    OpenCLArray(OpenCLContext& context, int size, const std::string& name, bool createHostBuffer = false, cl_int flags = CL_MEM_READ_WRITE) :
-            context(context), size(size), name(name), local(createHostBuffer ? size : 0), ownsBuffer(true) {
-        try {
-            buffer = new cl::Buffer(context.getContext(), flags, size*sizeof(T));
-        }
-        catch (cl::Error err) {
-            std::stringstream str;
-            str<<"Error creating array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
-            throw OpenMMException(str.str());
-        }
+    template <class T>
+    static OpenCLArray* create(OpenCLContext& context, int size, const std::string& name, cl_int flags = CL_MEM_READ_WRITE) {
+        return new OpenCLArray(context, size, sizeof(T), name, flags);
    }
    /**
-     * Create an OpenCLArray object the uses a preexisting Buffer.
+     * Create an OpenCLArray object that uses a preexisting Buffer.  The object is allocated on the heap with the "new" operator.
+     * The template argument is the data type of each array element.
     *
     * @param context           the context for which to create the array
     * @param buffer            the OpenCL Buffer this object encapsulates
     * @param size              the number of elements in the array
     * @param name              the name of the array
-     * @param createHostBuffer  specifies whether to create a buffer in host memory for copying data to and from
-     *                          the OpenCL Buffer
     */
-    OpenCLArray(OpenCLContext& context, cl::Buffer* buffer, int size, const std::string& name, bool createHostBuffer = false) :
-            context(context), buffer(buffer), size(size), name(name), local(createHostBuffer ? size : 0), ownsBuffer(false) {
-    }
-    ~OpenCLArray() {
-        if (ownsBuffer)
-            delete buffer;
-    }
-    const T& operator[](int index) const {
-        return local[index];
-    }
-    T& operator[](int index) {
-        return local[index];
+    template <class T>
+    static OpenCLArray* create(OpenCLContext& context, cl::Buffer* buffer, int size, const std::string& name) {
+        return new OpenCLArray(context, buffer, size, sizeof(T), name);
    }
+    /**
+     * Create an OpenCLArray object.
+     *
+     * @param context           the context for which to create the array
+     * @param size              the number of elements in the array
+     * @param elementSize       the size of each element in bytes
+     * @param name              the name of the array
+     * @param flags             the set of flags to specify when creating the OpenCL Buffer
+     */
+    OpenCLArray(OpenCLContext& context, int size, int elementSize, const std::string& name, cl_int flags = CL_MEM_READ_WRITE);
+    /**
+     * Create an OpenCLArray object that uses a preexisting Buffer.
+     *
+     * @param context           the context for which to create the array
+     * @param buffer            the OpenCL Buffer this object encapsulates
+     * @param size              the number of elements in the array
+     * @param elementSize       the size of each element in bytes
+     * @param name              the name of the array
+     */
+    OpenCLArray(OpenCLContext& context, cl::Buffer* buffer, int size, int elementSize, const std::string& name);
+    ~OpenCLArray();
    /**
     * Get the size of the array.
     */
    int getSize() const {
        return size;
    }
+    /**
+     * Get the size of each element in bytes.
+     */
+    int getElementSize() const {
+        return elementSize;
+    }
    /**
     * Get the name of the array.
     */
@@ -105,85 +113,50 @@ public:
    cl::Buffer& getDeviceBuffer() {
        return *buffer;
    }
-    /**
-     * Get a pointer to the host buffer.
-     */
-    T* getHostBuffer() {
-        return &local[0];
-    }
-    /**
-     * Get an element of the host buffer.
-     */
-    const T& get(int index) const {
-        return local[index];
-    }
-    /**
-     * Set an element of the host buffer.
-     */
-    void set(int index, const T& value) {
-        local[index] = value;
-    }
    /**
     * Copy the values in a vector to the Buffer.
     */
-    void upload(std::vector<T>& data, bool blocking = true) {
+    template <class T>
+    void upload(const std::vector<T>& data, bool blocking = true) {
+        if (sizeof(T) != elementSize || data.size() != size)
+            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
        upload(&data[0], blocking);
    }
    /**
     * Copy the values in the Buffer to a vector.
     */
-    void download(std::vector<T>& data) const {
+    template <class T>
+    void download(std::vector<T>& data, bool blocking = true) const {
+        if (sizeof(T) != elementSize)
+            throw OpenMMException("Error downloading array "+name+": The specified vector has the wrong element size");
        if (data.size() != size)
            data.resize(size);
-        download(&data[0]);
+        download(&data[0], blocking);
    }
    /**
     * Copy the values in an array to the Buffer.
+     * 
+     * @param data     the data to copy
+     * @param blocking if true, this call will block until the transfer is complete.
     */
-    void upload(T* data, bool blocking = true) {
-        try {
-            context.getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*sizeof(T), data);
-        }
-        catch (cl::Error err) {
-            std::stringstream str;
-            str<<"Error uploading array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
-            throw OpenMMException(str.str());
-        }
-    }
+    void upload(const void* data, bool blocking = true);
    /**
     * Copy the values in the Buffer to an array.
+     * 
+     * @param data     the array to copy the memory to
+     * @param blocking if true, this call will block until the transfer is complete.
     */
-    void download(T* data) const {
-        try {
-            context.getQueue().enqueueReadBuffer(*buffer, CL_TRUE, 0, size*sizeof(T), data);
-        }
-        catch (cl::Error err) {
-            std::stringstream str;
-            str<<"Error downloading array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
-            throw OpenMMException(str.str());
-        }
-    }
+    void download(void* data, bool blocking = true) const;
    /**
-     * Copy the values in the host buffer to the OpenCL Buffer.
+     * Copy the values in the Buffer to a second OpenCLArray.
+     * 
+     * @param dest     the destination array to copy to
     */
-    void upload(bool blocking = true) {
-        if (local.size() == 0)
-            throw OpenMMException(name+": Called upload() on an OpenCLArray with no host buffer");
-        upload(local, blocking);
-    }
-    /**
-     * Copy the values in the Buffer to the host buffer.
-     */
-    void download() {
-        if (local.size() == 0)
-            throw OpenMMException(name+": Called download() on an OpenCLArray with no host buffer");
-        download(local);
-    }
+    void copyTo(OpenCLArray& dest) const;
 private:
    OpenCLContext& context;
    cl::Buffer* buffer;
-    std::vector<T> local;
-    int size;
+    int size, elementSize;
    bool ownsBuffer;
    std::string name;
 };

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -87,7 +87,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
            for (int atom = 0; atom < numAtoms; atom++)
                indexVec[bond*width+atom] = forceAtoms[i][bond][atom];
        }
-        OpenCLArray<cl_uint>* indices = new OpenCLArray<cl_uint>(context, indexVec.size(), "bondedIndices");
+        OpenCLArray* indices = OpenCLArray::create<cl_uint>(context, indexVec.size(), "bondedIndices");
        indices->upload(indexVec);
        atomIndices.push_back(indices);
        bufferVec[i].resize(width*numBonds, 0);
@@ -151,7 +151,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
                for (int bond = 0; bond < numBonds; bond++)
                    for (int atom = 0; atom < numAtoms; atom++)
                        bufferVec[force][bond*width+atom] += bufferCounter[forceSets[i][k]][forceAtoms[force][bond][atom]];
-            OpenCLArray<cl_uint>* buffers = new OpenCLArray<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
+            OpenCLArray* buffers = OpenCLArray::create<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
            buffers->upload(bufferVec[force]);
            bufferIndices[force] = buffers;
        }

--- a/platforms/opencl/src/OpenCLBondedUtilities.h
+++ b/platforms/opencl/src/OpenCLBondedUtilities.h
@@ -134,8 +134,8 @@ private:
    std::vector<std::vector<int> > forceSets;
    std::vector<cl::Memory*> arguments;
    std::vector<std::string> argTypes;
-    std::vector<OpenCLArray<cl_uint>*> atomIndices;
-    std::vector<OpenCLArray<cl_uint>*> bufferIndices;
+    std::vector<OpenCLArray*> atomIndices;
+    std::vector<OpenCLArray*> bufferIndices;
    std::vector<std::string> prefixCode;
    int numForceBuffers, maxBonds;
    bool hasInitializedKernels;

--- a/platforms/opencl/src/OpenCLCompact.cpp
+++ b/platforms/opencl/src/OpenCLCompact.cpp
@@ -30,7 +30,7 @@
 using namespace OpenMM;

 OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context), dgBlockCounts(NULL) {
-    dgBlockCounts = new OpenCLArray<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
+    dgBlockCounts = OpenCLArray::create<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
    cl::Program program = context.createProgram(OpenCLKernelSources::compact);
    countKernel = cl::Kernel(program, "countElts");
    moveValidKernel = cl::Kernel(program, "moveValidElementsStaged");
@@ -41,7 +41,7 @@ OpenCLCompact::~OpenCLCompact() {
        delete dgBlockCounts;
 }

-void OpenCLCompact::compactStream(OpenCLArray<cl_uint>& dOut, OpenCLArray<cl_uint>& dIn, OpenCLArray<cl_uint>& dValid, OpenCLArray<cl_uint>& numValid) {
+void OpenCLCompact::compactStream(OpenCLArray& dOut, OpenCLArray& dIn, OpenCLArray& dValid, OpenCLArray& numValid) {
    // Figure out # elements per block
    unsigned int len = dIn.getSize();
    unsigned int numBlocks = context.getNumThreadBlocks();

--- a/platforms/opencl/src/OpenCLCompact.h
+++ b/platforms/opencl/src/OpenCLCompact.h
@@ -33,10 +33,10 @@ class OPENMM_EXPORT OpenCLCompact {
 public:
    OpenCLCompact(OpenCLContext& context);
    ~OpenCLCompact();
-    void compactStream(OpenCLArray<cl_uint>& dOut, OpenCLArray<cl_uint>& dIn, OpenCLArray<cl_uint>& dValid, OpenCLArray<cl_uint>& numValid);
+    void compactStream(OpenCLArray& dOut, OpenCLArray& dIn, OpenCLArray& dValid, OpenCLArray& numValid);
 private:
    OpenCLContext& context;
-    OpenCLArray<cl_uint>* dgBlockCounts;
+    OpenCLArray* dgBlockCounts;
    cl::Kernel countKernel;
    cl::Kernel moveValidKernel;
 };

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -67,7 +67,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i

 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData) :
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
-        velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
+        velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), pinnedBuffer(NULL), integration(NULL),
         bonded(NULL), nonbonded(NULL), thread(NULL) {
     try {
         contextIndex = platformData.contexts.size();
@@ -217,8 +217,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
        bonded = new OpenCLBondedUtilities(*this);
        nonbonded = new OpenCLNonbondedUtilities(*this);
-        posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
-        velm = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "velm", true);
+        posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
+        velm = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "velm");
        posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
    }
    catch (cl::Error err) {
@@ -242,19 +242,20 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.

    cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
-    OpenCLArray<mm_float8> values(*this, 20, "values", true);
+    OpenCLArray valuesArray(*this, 20, sizeof(mm_float8), "values");
+    vector<mm_float8> values(valuesArray.getSize());
    float nextValue = 1e-4f;
-    for (int i = 0; i < values.getSize(); ++i) {
+    for (int i = 0; i < (int) values.size(); ++i) {
        values[i].s0 = nextValue;
        nextValue *= (float) M_PI;
    }
-    values.upload();
-    accuracyKernel.setArg<cl::Buffer>(0, values.getDeviceBuffer());
-    accuracyKernel.setArg<cl_int>(1, values.getSize());
-    executeKernel(accuracyKernel, values.getSize());
-    values.download();
+    valuesArray.upload(values);
+    accuracyKernel.setArg<cl::Buffer>(0, valuesArray.getDeviceBuffer());
+    accuracyKernel.setArg<cl_int>(1, values.size());
+    executeKernel(accuracyKernel, values.size());
+    valuesArray.download(values);
    double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
-    for (int i = 0; i < values.getSize(); ++i) {
+    for (int i = 0; i < (int) values.size(); ++i) {
        double v = values[i].s0;
        double correctSqrt = sqrt(v);
        maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
@@ -283,6 +284,8 @@ OpenCLContext::~OpenCLContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    if (pinnedBuffer != NULL)
+        delete pinnedBuffer;
    if (posq != NULL)
        delete posq;
    if (velm != NULL)
@@ -295,8 +298,8 @@ OpenCLContext::~OpenCLContext() {
        delete longForceBuffer;
    if (energyBuffer != NULL)
        delete energyBuffer;
-    if (atomIndex != NULL)
-        delete atomIndex;
+    if (atomIndexDevice != NULL)
+        delete atomIndexDevice;
    if (integration != NULL)
        delete integration;
    if (bonded != NULL)
@@ -308,19 +311,20 @@ OpenCLContext::~OpenCLContext() {
 }

 void OpenCLContext::initialize() {
+    vector<mm_float4> v(paddedNumAtoms, mm_float4(0, 0, 0, 0));
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
-        (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
+        v[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
    }
-    velm->upload();
+    velm->upload(v);
    bonded->initialize(system);
    numForceBuffers = platformData.contexts.size();
    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
-    forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
+    forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
    if (supports64BitGlobalAtomics) {
-        longForceBuffer = new OpenCLArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
+        longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
@@ -328,13 +332,17 @@ void OpenCLContext::initialize() {
        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
    }
    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
-    force = new OpenCLArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
-    energyBuffer = new OpenCLArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
+    force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
+    energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
-    atomIndex = new OpenCLArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
+    int bufferBytes = max(posq->getSize()*sizeof(mm_float4), energyBuffer->getSize()*sizeof(cl_float));
+    pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
+    pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
+    atomIndexDevice = OpenCLArray::create<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
+    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
-        (*atomIndex)[i] = i;
-    atomIndex->upload();
+        atomIndex[i] = i;
+    atomIndexDevice->upload(atomIndex);
    findMoleculeGroups();
    moleculesInvalid = false;
    nonbonded->initialize(system);
@@ -410,12 +418,8 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
    }
 }

-void OpenCLContext::clearBuffer(OpenCLArray<float>& array) {
-    clearBuffer(array.getDeviceBuffer(), array.getSize());
-}
-
-void OpenCLContext::clearBuffer(OpenCLArray<mm_float4>& array) {
-    clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
+void OpenCLContext::clearBuffer(OpenCLArray& array) {
+    clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize()/sizeof(cl_float));
 }

 void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
@@ -500,7 +504,7 @@ void OpenCLContext::reduceForces() {
        reduceBuffer(*forceBuffers, numForceBuffers);
 }

-void OpenCLContext::reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers) {
+void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
    int bufferSize = array.getSize()/numBuffers;
    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
@@ -760,26 +764,28 @@ void OpenCLContext::validateMolecules() {
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
    
-    vector<mm_float4> newPosq(numAtoms);
-    vector<mm_float4> newVelm(numAtoms);
+    vector<mm_float4> oldPosq(paddedNumAtoms);
+    vector<mm_float4> newPosq(paddedNumAtoms);
+    vector<mm_float4> oldVelm(paddedNumAtoms);
+    vector<mm_float4> newVelm(paddedNumAtoms);
    vector<mm_int4> newCellOffsets(numAtoms);
-    posq->download();
-    velm->download();
+    posq->download(oldPosq);
+    velm->download(oldVelm);
    for (int i = 0; i < numAtoms; i++) {
-        int index = atomIndex->get(i);
-        newPosq[index] = posq->get(i);
-        newVelm[index] = velm->get(i);
+        int index = atomIndex[i];
+        newPosq[index] = oldPosq[i];
+        newVelm[index] = oldVelm[i];
        newCellOffsets[index] = posCellOffsets[i];
    }
+    posq->upload(newPosq);
+    velm->upload(newVelm);
    for (int i = 0; i < numAtoms; i++) {
-        posq->set(i, newPosq[i]);
-        velm->set(i, newVelm[i]);
-        atomIndex->set(i, i);
+        atomIndex[i] = i;
        posCellOffsets[i] = newCellOffsets[i];
    }
-    posq->upload();
-    velm->upload();
-    atomIndex->upload();
+    posq->upload(newPosq);
+    velm->upload(newVelm);
+    atomIndexDevice->upload(atomIndex);
    findMoleculeGroups();
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
@@ -794,11 +800,13 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {

    // Find the range of positions and the number of bins along each axis.

-    posq->download();
-    velm->download();
-    float minx = posq->get(0).x, maxx = posq->get(0).x;
-    float miny = posq->get(0).y, maxy = posq->get(0).y;
-    float minz = posq->get(0).z, maxz = posq->get(0).z;
+    vector<mm_float4> oldPosq(paddedNumAtoms);
+    vector<mm_float4> oldVelm(paddedNumAtoms);
+    posq->download(oldPosq);
+    velm->download(oldVelm);
+    float minx = oldPosq[0].x, maxx = oldPosq[0].x;
+    float miny = oldPosq[0].y, maxy = oldPosq[0].y;
+    float minz = oldPosq[0].z, maxz = oldPosq[0].z;
    if (nonbonded->getUsePeriodic()) {
        minx = miny = minz = 0.0;
        maxx = periodicBoxSize.x;
@@ -807,7 +815,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
    }
    else {
        for (int i = 1; i < numAtoms; i++) {
-            const mm_float4& pos = posq->get(i);
+            const mm_float4& pos = oldPosq[i];
            minx = min(minx, pos.x);
            maxx = max(maxx, pos.x);
            miny = min(miny, pos.y);
@@ -820,8 +828,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
    // Loop over each group of identical molecules and reorder them.

    vector<int> originalIndex(numAtoms);
-    vector<mm_float4> newPosq(numAtoms);
-    vector<mm_float4> newVelm(numAtoms);
+    vector<mm_float4> newPosq(paddedNumAtoms);
+    vector<mm_float4> newVelm(paddedNumAtoms);
    vector<mm_int4> newCellOffsets(numAtoms);
    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
        // Find the center of each molecule.
@@ -837,7 +845,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
            molPos[i].z = 0.0f;
            for (int j = 0; j < (int)atoms.size(); j++) {
                int atom = atoms[j]+mol.offsets[i];
-                const mm_float4& pos = posq->get(atom);
+                const mm_float4& pos = oldPosq[atom];
                molPos[i].x += pos.x;
                molPos[i].y += pos.y;
                molPos[i].z += pos.z;
@@ -863,11 +871,11 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
                    if (enforcePeriodic) {
                        for (int j = 0; j < (int) atoms.size(); j++) {
                            int atom = atoms[j]+mol.offsets[i];
-                            mm_float4 p = posq->get(atom);
+                            mm_float4 p = oldPosq[atom];
                            p.x -= dx;
                            p.y -= dy;
                            p.z -= dz;
-                            posq->set(atom, p);
+                            oldPosq[atom] = p;
                            posCellOffsets[atom].x -= xcell;
                            posCellOffsets[atom].y -= ycell;
                            posCellOffsets[atom].z -= zcell;
@@ -918,9 +926,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
            for (int j = 0; j < (int)atoms.size(); j++) {
                int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
                int newIndex = mol.offsets[i]+atoms[j];
-                originalIndex[newIndex] = atomIndex->get(oldIndex);
-                newPosq[newIndex] = posq->get(oldIndex);
-                newVelm[newIndex] = velm->get(oldIndex);
+                originalIndex[newIndex] = atomIndex[oldIndex];
+                newPosq[newIndex] = oldPosq[oldIndex];
+                newVelm[newIndex] = oldVelm[oldIndex];
                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
            }
        }
@@ -929,14 +937,12 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
    // Update the streams.

    for (int i = 0; i < numAtoms; i++) {
-        posq->set(i, newPosq[i]);
-        velm->set(i, newVelm[i]);
-        atomIndex->set(i, originalIndex[i]);
+        atomIndex[i] = originalIndex[i];
        posCellOffsets[i] = newCellOffsets[i];
    }
-    posq->upload();
-    velm->upload();
-    atomIndex->upload();
+    posq->upload(newPosq);
+    velm->upload(newVelm);
+    atomIndexDevice->upload(atomIndex);
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
 }

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -42,7 +42,6 @@

 namespace OpenMM {

-template <class T>
 class OpenCLArray;
 class OpenCLForceInfo;
 class OpenCLIntegrationUtilities;
@@ -196,44 +195,57 @@ public:
    /**
     * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
     */
-    OpenCLArray<mm_float4>& getPosq() {
+    OpenCLArray& getPosq() {
        return *posq;
    }
    /**
     * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
     */
-    OpenCLArray<mm_float4>& getVelm() {
+    OpenCLArray& getVelm() {
        return *velm;
    }
    /**
     * Get the array which contains the force on each atom.
     */
-    OpenCLArray<mm_float4>& getForce() {
+    OpenCLArray& getForce() {
        return *force;
    }
    /**
     * Get the array which contains the buffers in which forces are computed.
     */
-    OpenCLArray<mm_float4>& getForceBuffers() {
+    OpenCLArray& getForceBuffers() {
        return *forceBuffers;
    }
    /**
     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
     */
-    OpenCLArray<cl_long>& getLongForceBuffer() {
+    OpenCLArray& getLongForceBuffer() {
        return *longForceBuffer;
    }
    /**
     * Get the array which contains the buffer in which energy is computed.
     */
-    OpenCLArray<cl_float>& getEnergyBuffer() {
+    OpenCLArray& getEnergyBuffer() {
        return *energyBuffer;
    }
+    /**
+     * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
+     * This is guaranteed to be at least as large as any of the arrays returned by methods of this class.  This returns the context's single pinnedMemory pointer, so all callers share one block and a later transfer through it overwrites earlier contents.
+     */
+    void* getPinnedBuffer() {
+        return pinnedMemory;
+    }
+    /**
+     * Get the host-side vector which contains the index of each atom.  This is the host copy that is uploaded to the device array returned by getAtomIndexArray().
+     */
+    const std::vector<int>& getAtomIndex() const {
+        return atomIndex;
+    }
    /**
     * Get the array which contains the index of each atom.
     */
-    OpenCLArray<cl_int>& getAtomIndex() {
-        return *atomIndex;
+    OpenCLArray& getAtomIndexArray() {
+        return *atomIndexDevice;
    }
    /**
     * Get the number of cells by which the positions are offset.
@@ -277,11 +289,7 @@ public:
    /**
     * Set all elements of an array to 0.
     */
-    void clearBuffer(OpenCLArray<float>& array);
-    /**
-     * Set all elements of an array to 0.
-     */
-    void clearBuffer(OpenCLArray<mm_float4>& array);
+    void clearBuffer(OpenCLArray& array);
    /**
     * Set all elements of an array to 0.
     *
@@ -307,7 +315,7 @@ public:
     * @param array       the array containing the buffers to reduce
     * @param numBuffers  the number of buffers packed into the array
     */
-    void reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers);
+    void reduceBuffer(OpenCLArray& array, int numBuffers);
    /**
     * Sum the buffesr containing forces.
     */
@@ -527,13 +535,16 @@ private:
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
    std::vector<mm_int4> posCellOffsets;
-    OpenCLArray<mm_float4>* posq;
-    OpenCLArray<mm_float4>* velm;
-    OpenCLArray<mm_float4>* force;
-    OpenCLArray<mm_float4>* forceBuffers;
-    OpenCLArray<cl_long>* longForceBuffer;
-    OpenCLArray<cl_float>* energyBuffer;
-    OpenCLArray<cl_int>* atomIndex;
+    cl::Buffer* pinnedBuffer;
+    void* pinnedMemory;
+    OpenCLArray* posq;
+    OpenCLArray* velm;
+    OpenCLArray* force;
+    OpenCLArray* forceBuffers;
+    OpenCLArray* longForceBuffer;
+    OpenCLArray* energyBuffer;
+    OpenCLArray* atomIndexDevice;
+    std::vector<int> atomIndex;
    std::vector<cl::Memory*> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -41,7 +41,7 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
    ykernel = createKernel(zsize, xsize, ysize);
 }

-void OpenCLFFT3D::execFFT(OpenCLArray<mm_float2>& in, OpenCLArray<mm_float2>& out, bool forward) {
+void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
    int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
    if (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU)
        maxSize = 1;

--- a/platforms/opencl/src/OpenCLFFT3D.h
+++ b/platforms/opencl/src/OpenCLFFT3D.h
@@ -72,7 +72,7 @@ public:
     * @param out      on exit, this contains the transformed data
     * @param forward  true to perform a forward transform, false to perform an inverse transform
     */
-    void execFFT(OpenCLArray<mm_float2>& in, OpenCLArray<mm_float2>& out, bool forward = true);
+    void execFFT(OpenCLArray& in, OpenCLArray& out, bool forward = true);
    /**
     * Get the smallest legal size for a dimension of the grid (that is, a size with no prime
     * factors other than 2, 3, and 5).

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -96,12 +96,12 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
    // Create workspace arrays.

-    posDelta = new OpenCLArray<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
+    posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
    vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0, 0.0, 0.0, 0.0));
    posDelta->upload(deltas);
-    stepSize = new OpenCLArray<mm_float2>(context, 1, "stepSize", true);
-    stepSize->set(0, mm_float2(0.0f, 0.0f));
-    stepSize->upload();
+    stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
+    vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
+    stepSize->upload(step);

    // Create kernels for enforcing constraints.

@@ -192,8 +192,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
            isShakeAtom[atom2] = true;
            isShakeAtom[atom3] = true;
        }
-        settleAtoms = new OpenCLArray<mm_int4>(context, atoms.size(), "settleAtoms");
-        settleParams = new OpenCLArray<mm_float2>(context, params.size(), "settleParams");
+        settleAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "settleAtoms");
+        settleParams = OpenCLArray::create<mm_float2>(context, params.size(), "settleParams");
        settleAtoms->upload(atoms);
        settleParams->upload(params);
    }
@@ -274,8 +274,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
                isShakeAtom[cluster.peripheralID[2]] = true;
            ++index;
        }
-        shakeAtoms = new OpenCLArray<mm_int4>(context, atoms.size(), "shakeAtoms");
-        shakeParams = new OpenCLArray<mm_float4>(context, params.size(), "shakeParams");
+        shakeAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "shakeAtoms");
+        shakeParams = OpenCLArray::create<mm_float4>(context, params.size(), "shakeParams");
        shakeAtoms->upload(atoms);
        shakeParams->upload(params);
    }
@@ -457,18 +457,18 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c

        // Record the CCMA data structures.

-        ccmaAtoms = new OpenCLArray<mm_int2>(context, numCCMA, "CcmaAtoms");
-        ccmaDistance = new OpenCLArray<mm_float4>(context, numCCMA, "CcmaDistance");
-        ccmaAtomConstraints = new OpenCLArray<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
-        ccmaNumAtomConstraints = new OpenCLArray<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        ccmaDelta1 = new OpenCLArray<cl_float>(context, numCCMA, "CcmaDelta1");
-        ccmaDelta2 = new OpenCLArray<cl_float>(context, numCCMA, "CcmaDelta2");
-        ccmaConverged = new OpenCLArray<cl_int>(context, 2, "CcmaConverged");
+        ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
+        ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
+        ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
+        ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
+        ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
+        ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
+        ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
        ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
        ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
-        ccmaReducedMass = new OpenCLArray<cl_float>(context, numCCMA, "CcmaReducedMass");
-        ccmaConstraintMatrixColumn = new OpenCLArray<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
-        ccmaConstraintMatrixValue = new OpenCLArray<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+        ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
+        ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
        vector<mm_int2> atomsVec(ccmaAtoms->getSize());
        vector<mm_float4> distanceVec(ccmaDistance->getSize());
        vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
@@ -556,12 +556,12 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    int num2Avg = vsite2AvgAtomVec.size();
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
-    vsite2AvgAtoms = new OpenCLArray<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
-    vsite2AvgWeights = new OpenCLArray<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
-    vsite3AvgAtoms = new OpenCLArray<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
-    vsite3AvgWeights = new OpenCLArray<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
-    vsiteOutOfPlaneAtoms = new OpenCLArray<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
-    vsiteOutOfPlaneWeights = new OpenCLArray<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
+    vsite2AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
+    vsite2AvgWeights = OpenCLArray::create<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
+    vsite3AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
+    vsite3AvgWeights = OpenCLArray::create<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
+    vsiteOutOfPlaneAtoms = OpenCLArray::create<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
+    vsiteOutOfPlaneWeights = OpenCLArray::create<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
    if (num2Avg > 0) {
        vsite2AvgAtoms->upload(vsite2AvgAtomVec);
        vsite2AvgWeights->upload(vsite2AvgWeightVec);
@@ -779,8 +779,8 @@ void OpenCLIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNu
    // Create the random number arrays.

    lastSeed = randomNumberSeed;
-    random = new OpenCLArray<mm_float4>(context, 32*context.getPaddedNumAtoms(), "random");
-    randomSeed = new OpenCLArray<mm_int4>(context, context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
+    random = OpenCLArray::create<mm_float4>(context, 32*context.getPaddedNumAtoms(), "random");
+    randomSeed = OpenCLArray::create<mm_int4>(context, context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
    randomPos = random->getSize();

    // Use a quick and dirty RNG to pick seeds for the real random number generator.
@@ -809,7 +809,7 @@ int OpenCLIntegrationUtilities::prepareRandomNumbers(int numValues) {
    }
    if (numValues > random->getSize()) {
        delete random;
-        random = new OpenCLArray<mm_float4>(context, numValues, "random");
+        random = OpenCLArray::create<mm_float4>(context, numValues, "random");
    }
    randomKernel.setArg<cl_int>(0, random->getSize());
    randomKernel.setArg<cl::Buffer>(1, random->getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.h
@@ -46,20 +46,20 @@ public:
    /**
     * Get the array which contains position deltas.
     */
-    OpenCLArray<mm_float4>& getPosDelta() {
+    OpenCLArray& getPosDelta() {
        return *posDelta;
    }
    /**
     * Get the array which contains random values.  Each element is a float4, whose components
     * are independent, normally distributed random numbers with mean 0 and variance 1.
     */
-    OpenCLArray<mm_float4>& getRandom() {
+    OpenCLArray& getRandom() {
        return *random;
    }
    /**
     * Get the array which contains the current step size.
     */
-    OpenCLArray<mm_float2>& getStepSize() {
+    OpenCLArray& getStepSize() {
        return *stepSize;
    }
    /**
@@ -116,32 +116,32 @@ private:
    cl::Kernel ccmaPosUpdateKernel, ccmaVelUpdateKernel;
    cl::Kernel vsitePositionKernel, vsiteForceKernel;
    cl::Kernel randomKernel;
-    OpenCLArray<mm_float4>* posDelta;
-    OpenCLArray<mm_int4>* settleAtoms;
-    OpenCLArray<mm_float2>* settleParams;
-    OpenCLArray<mm_int4>* shakeAtoms;
-    OpenCLArray<mm_float4>* shakeParams;
-    OpenCLArray<mm_float4>* random;
-    OpenCLArray<mm_int4>* randomSeed;
-    OpenCLArray<mm_float2>* stepSize;
-    OpenCLArray<mm_int2>* ccmaAtoms;
-    OpenCLArray<mm_float4>* ccmaDistance;
-    OpenCLArray<cl_float>* ccmaReducedMass;
-    OpenCLArray<cl_int>* ccmaAtomConstraints;
-    OpenCLArray<cl_int>* ccmaNumAtomConstraints;
-    OpenCLArray<cl_int>* ccmaConstraintMatrixColumn;
-    OpenCLArray<cl_float>* ccmaConstraintMatrixValue;
-    OpenCLArray<cl_float>* ccmaDelta1;
-    OpenCLArray<cl_float>* ccmaDelta2;
-    OpenCLArray<cl_int>* ccmaConverged;
+    OpenCLArray* posDelta;
+    OpenCLArray* settleAtoms;
+    OpenCLArray* settleParams;
+    OpenCLArray* shakeAtoms;
+    OpenCLArray* shakeParams;
+    OpenCLArray* random;
+    OpenCLArray* randomSeed;
+    OpenCLArray* stepSize;
+    OpenCLArray* ccmaAtoms;
+    OpenCLArray* ccmaDistance;
+    OpenCLArray* ccmaReducedMass;
+    OpenCLArray* ccmaAtomConstraints;
+    OpenCLArray* ccmaNumAtomConstraints;
+    OpenCLArray* ccmaConstraintMatrixColumn;
+    OpenCLArray* ccmaConstraintMatrixValue;
+    OpenCLArray* ccmaDelta1;
+    OpenCLArray* ccmaDelta2;
+    OpenCLArray* ccmaConverged;
    cl::Buffer* ccmaConvergedBuffer;
    cl_int* ccmaConvergedMemory;
-    OpenCLArray<mm_int4>* vsite2AvgAtoms;
-    OpenCLArray<mm_float2>* vsite2AvgWeights;
-    OpenCLArray<mm_int4>* vsite3AvgAtoms;
-    OpenCLArray<mm_float4>* vsite3AvgWeights;
-    OpenCLArray<mm_int4>* vsiteOutOfPlaneAtoms;
-    OpenCLArray<mm_float4>* vsiteOutOfPlaneWeights;
+    OpenCLArray* vsite2AvgAtoms;
+    OpenCLArray* vsite2AvgWeights;
+    OpenCLArray* vsite3AvgAtoms;
+    OpenCLArray* vsite3AvgWeights;
+    OpenCLArray* vsiteOutOfPlaneAtoms;
+    OpenCLArray* vsiteOutOfPlaneWeights;
    int randomPos;
    int lastSeed, numVsites;
    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -116,9 +116,10 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
    cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
    double sum = 0.0f;
    if (includeEnergy) {
-        OpenCLArray<cl_float>& energy = cl.getEnergyBuffer();
-        energy.download();
-        for (int i = 0; i < energy.getSize(); i++)
+        OpenCLArray& energyArray = cl.getEnergyBuffer();
+        cl_float* energy = (cl_float*) cl.getPinnedBuffer();
+        energyArray.download(energy);
+        for (int i = 0; i < energyArray.getSize(); i++)
            sum += energy[i];
    }
    return sum;
@@ -138,9 +139,9 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
 }

 void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    OpenCLArray<mm_float4>& posq = cl.getPosq();
-    posq.download();
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+    cl.getPosq().download(posq);
+    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
    mm_float4 periodicBoxSize = cl.getPeriodicBoxSize();
@@ -152,8 +153,9 @@ void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3
 }

 void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
-    OpenCLArray<mm_float4>& posq = cl.getPosq();
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+    cl.getPosq().download(posq);
+    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    for (int i = 0; i < numParticles; ++i) {
        mm_float4& pos = posq[i];
@@ -164,15 +166,15 @@ void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vecto
    }
    for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
        posq[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
-    posq.upload();
+    cl.getPosq().upload(posq);
    for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++)
        cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0);
 }

 void OpenCLUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
-    OpenCLArray<mm_float4>& velm = cl.getVelm();
-    velm.download();
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
+    cl.getVelm().download(velm);
+    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
    for (int i = 0; i < numParticles; ++i) {
@@ -182,8 +184,9 @@ void OpenCLUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec
 }

 void OpenCLUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
-    OpenCLArray<mm_float4>& velm = cl.getVelm();
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
+    cl.getVelm().download(velm);
+    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    for (int i = 0; i < numParticles; ++i) {
        mm_float4& vel = velm[i];
@@ -194,13 +197,13 @@ void OpenCLUpdateStateDataKernel::setVelocities(ContextImpl& context, const vect
    }
    for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
        velm[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
-    velm.upload();
+    cl.getVelm().upload(velm);
 }

 void OpenCLUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
-    OpenCLArray<mm_float4>& force = cl.getForce();
-    force.download();
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    mm_float4* force = (mm_float4*) cl.getPinnedBuffer();
+    cl.getForce().download(force);
+    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    forces.resize(numParticles);
    for (int i = 0; i < numParticles; ++i) {
@@ -231,11 +234,12 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream
    stream.write((char*) &stepCount, sizeof(int));
    int computeForceCount = cl.getComputeForceCount();
    stream.write((char*) &computeForceCount, sizeof(int));
-    cl.getPosq().download();
-    stream.write((char*) &cl.getPosq()[0], sizeof(mm_float4)*cl.getPosq().getSize());
-    cl.getVelm().download();
-    stream.write((char*) &cl.getVelm()[0], sizeof(mm_float4)*cl.getVelm().getSize());
-    stream.write((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().getSize());
+    char* buffer = (char*) cl.getPinnedBuffer();
+    cl.getPosq().download((mm_float4*) buffer);
+    stream.write(buffer, sizeof(mm_float4)*cl.getPosq().getSize());
+    cl.getVelm().download((mm_float4*) buffer);
+    stream.write(buffer, sizeof(mm_float4)*cl.getVelm().getSize());
+    stream.write((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
    stream.write((char*) &cl.getPosCellOffsets()[0], sizeof(mm_int4)*cl.getPosCellOffsets().size());
    mm_float4 box = cl.getPeriodicBoxSize();
    stream.write((char*) &box, sizeof(mm_float4));
@@ -259,12 +263,13 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
        contexts[i]->setStepCount(stepCount);
        contexts[i]->setComputeForceCount(computeForceCount);
    }
-    stream.read((char*) &cl.getPosq()[0], sizeof(mm_float4)*cl.getPosq().getSize());
-    cl.getPosq().upload();
-    stream.read((char*) &cl.getVelm()[0], sizeof(mm_float4)*cl.getVelm().getSize());
-    cl.getVelm().upload();
-    stream.read((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().getSize());
-    cl.getAtomIndex().upload();
+    char* buffer = (char*) cl.getPinnedBuffer();
+    stream.read(buffer, sizeof(mm_float4)*cl.getPosq().getSize());
+    cl.getPosq().upload(buffer);
+    stream.read(buffer, sizeof(mm_float4)*cl.getVelm().getSize());
+    cl.getVelm().upload(buffer);
+    stream.read((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
+    cl.getAtomIndexArray().upload(cl.getAtomIndex());
    stream.read((char*) &cl.getPosCellOffsets()[0], sizeof(mm_int4)*cl.getPosCellOffsets().size());
    mm_float4 box;
    stream.read((char*) &box, sizeof(mm_float4));
@@ -342,7 +347,7 @@ void OpenCLCalcHarmonicBondForceKernel::initialize(const System& system, const H
    if (numBonds == 0)
        return;
    vector<vector<int> > atoms(numBonds, vector<int>(2));
-    params = new OpenCLArray<mm_float2>(cl, numBonds, "bondParams");
+    params = OpenCLArray::create<mm_float2>(cl, numBonds, "bondParams");
    vector<mm_float2> paramVector(numBonds);
    for (int i = 0; i < numBonds; i++) {
        double length, k;
@@ -463,7 +468,7 @@ void OpenCLCalcCustomBondForceKernel::initialize(const System& system, const Cus
        variables[name] = "bondParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customBondGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customBondGlobals", CL_MEM_READ_ONLY);
        globals->upload(globalParamValues);
        string argName = cl.getBondedUtilities().addArgument(globals->getDeviceBuffer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -565,7 +570,7 @@ void OpenCLCalcHarmonicAngleForceKernel::initialize(const System& system, const
    if (numAngles == 0)
        return;
    vector<vector<int> > atoms(numAngles, vector<int>(3));
-    params = new OpenCLArray<mm_float2>(cl, numAngles, "angleParams");
+    params = OpenCLArray::create<mm_float2>(cl, numAngles, "angleParams");
    vector<mm_float2> paramVector(numAngles);
    for (int i = 0; i < numAngles; i++) {
        double angle, k;
@@ -688,7 +693,7 @@ void OpenCLCalcCustomAngleForceKernel::initialize(const System& system, const Cu
        variables[name] = "angleParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customAngleGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customAngleGlobals", CL_MEM_READ_ONLY);
        globals->upload(globalParamValues);
        string argName = cl.getBondedUtilities().addArgument(globals->getDeviceBuffer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -791,7 +796,7 @@ void OpenCLCalcPeriodicTorsionForceKernel::initialize(const System& system, cons
    if (numTorsions == 0)
        return;
    vector<vector<int> > atoms(numTorsions, vector<int>(4));
-    params = new OpenCLArray<mm_float4>(cl, numTorsions, "periodicTorsionParams");
+    params = OpenCLArray::create<mm_float4>(cl, numTorsions, "periodicTorsionParams");
    vector<mm_float4> paramVector(numTorsions);
    for (int i = 0; i < numTorsions; i++) {
        int periodicity;
@@ -875,7 +880,7 @@ void OpenCLCalcRBTorsionForceKernel::initialize(const System& system, const RBTo
    if (numTorsions == 0)
        return;
    vector<vector<int> > atoms(numTorsions, vector<int>(4));
-    params = new OpenCLArray<mm_float8>(cl, numTorsions, "rbTorsionParams");
+    params = OpenCLArray::create<mm_float8>(cl, numTorsions, "rbTorsionParams");
    vector<mm_float8> paramVector(numTorsions);
    for (int i = 0; i < numTorsions; i++) {
        double c0, c1, c2, c3, c4, c5;
@@ -987,9 +992,9 @@ void OpenCLCalcCMAPTorsionForceKernel::initialize(const System& system, const CM
    vector<cl_int> torsionMapsVec(numTorsions);
    for (int i = 0; i < numTorsions; i++)
        force.getTorsionParameters(startIndex+i, torsionMapsVec[i], atoms[i][0], atoms[i][1], atoms[i][2], atoms[i][3], atoms[i][4], atoms[i][5], atoms[i][6], atoms[i][7]);
-    coefficients = new OpenCLArray<mm_float4>(cl, coeffVec.size(), "cmapTorsionCoefficients");
-    mapPositions = new OpenCLArray<mm_int2>(cl, numMaps, "cmapTorsionMapPositions");
-    torsionMaps = new OpenCLArray<cl_int>(cl, numTorsions, "cmapTorsionMaps");
+    coefficients = OpenCLArray::create<mm_float4>(cl, coeffVec.size(), "cmapTorsionCoefficients");
+    mapPositions = OpenCLArray::create<mm_int2>(cl, numMaps, "cmapTorsionMapPositions");
+    torsionMaps = OpenCLArray::create<cl_int>(cl, numTorsions, "cmapTorsionMaps");
    coefficients->upload(coeffVec);
    mapPositions->upload(mapPositionsVec);
    torsionMaps->upload(torsionMapsVec);
@@ -1086,7 +1091,7 @@ void OpenCLCalcCustomTorsionForceKernel::initialize(const System& system, const
        variables[name] = "torsionParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customTorsionGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customTorsionGlobals", CL_MEM_READ_ONLY);
        globals->upload(globalParamValues);
        string argName = cl.getBondedUtilities().addArgument(globals->getDeviceBuffer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -1229,8 +1234,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    // Initialize nonbonded interactions.

    int numParticles = force.getNumParticles();
-    sigmaEpsilon = new OpenCLArray<mm_float2>(cl, cl.getPaddedNumAtoms(), "sigmaEpsilon");
-    OpenCLArray<mm_float4>& posq = cl.getPosq();
+    sigmaEpsilon = OpenCLArray::create<mm_float2>(cl, cl.getPaddedNumAtoms(), "sigmaEpsilon");
+    vector<mm_float4> posq(cl.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
    vector<mm_float2> sigmaEpsilonVector(cl.getPaddedNumAtoms());
    vector<vector<int> > exclusionList(numParticles);
    double sumSquaredCharges = 0.0;
@@ -1252,7 +1257,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
    }
-    posq.upload();
+    cl.getPosq().upload(posq);
    sigmaEpsilon->upload(sigmaEpsilonVector);
    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
@@ -1293,7 +1298,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        cl::Program program = cl.createProgram(OpenCLKernelSources::ewald, replacements);
        ewaldSumsKernel = cl::Kernel(program, "calculateEwaldCosSinSums");
        ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
-        cosSinSums = new OpenCLArray<mm_float2>(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), "cosSinSums");
+        cosSinSums = OpenCLArray::create<mm_float2>(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), "cosSinSums");
    }
    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
        // Compute the PME parameters.
@@ -1317,18 +1322,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb

        // Create required data structures.

-        pmeGrid = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid");
+        pmeGrid = OpenCLArray::create<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid");
        cl.addAutoclearBuffer(pmeGrid->getDeviceBuffer(), pmeGrid->getSize()*2);
-        pmeGrid2 = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid2");
-        pmeBsplineModuliX = new OpenCLArray<cl_float>(cl, gridSizeX, "pmeBsplineModuliX");
-        pmeBsplineModuliY = new OpenCLArray<cl_float>(cl, gridSizeY, "pmeBsplineModuliY");
-        pmeBsplineModuliZ = new OpenCLArray<cl_float>(cl, gridSizeZ, "pmeBsplineModuliZ");
-        pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
+        pmeGrid2 = OpenCLArray::create<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid2");
+        pmeBsplineModuliX = OpenCLArray::create<cl_float>(cl, gridSizeX, "pmeBsplineModuliX");
+        pmeBsplineModuliY = OpenCLArray::create<cl_float>(cl, gridSizeY, "pmeBsplineModuliY");
+        pmeBsplineModuliZ = OpenCLArray::create<cl_float>(cl, gridSizeZ, "pmeBsplineModuliZ");
+        pmeBsplineTheta = OpenCLArray::create<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
        if (deviceIsCpu)
-            pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDTheta");
-        pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-        pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
+            pmeBsplineDTheta = OpenCLArray::create<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDTheta");
+        pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+        pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
        sort = new OpenCLSort<SortTrait>(cl, cl.getNumAtoms());
        fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);

@@ -1411,7 +1416,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    if (numExceptions > 0) {
        exceptionAtoms.resize(numExceptions);
        vector<vector<int> > atoms(numExceptions, vector<int>(2));
-        exceptionParams = new OpenCLArray<mm_float4>(cl, numExceptions, "exceptionParams");
+        exceptionParams = OpenCLArray::create<mm_float4>(cl, numExceptions, "exceptionParams");
        vector<mm_float4> exceptionParamsVector(numExceptions);
        for (int i = 0; i < numExceptions; i++) {
            double chargeProd, sigma, epsilon;
@@ -1577,20 +1582,21 @@ void OpenCLCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& contex
    
    // Record the per-particle parameters.
    
-    OpenCLArray<mm_float4>& posq = cl.getPosq();
-    posq.download();
+    OpenCLArray& posq = cl.getPosq();
+    posq.download((mm_float4*) cl.getPinnedBuffer());
+    mm_float4* posqf = (mm_float4*) cl.getPinnedBuffer();
    vector<mm_float2> sigmaEpsilonVector(cl.getPaddedNumAtoms());
    double sumSquaredCharges = 0.0;
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    const vector<cl_int>& order = cl.getAtomIndex();
    for (int i = 0; i < force.getNumParticles(); i++) {
        int index = order[i];
        double charge, sigma, epsilon;
        force.getParticleParameters(index, charge, sigma, epsilon);
-        posq[i].w = (float) charge;
+        posqf[i].w = (float) charge;
        sigmaEpsilonVector[index] = mm_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
        sumSquaredCharges += charge*charge;
    }
-    posq.upload();
+    posq.upload(cl.getPinnedBuffer());
    sigmaEpsilon->upload(sigmaEpsilonVector);
    
    // Record the exceptions.
@@ -1669,7 +1675,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
    int numParticles = force.getNumParticles();
    params = new OpenCLParameterSet(cl, force.getNumPerParticleParameters(), numParticles, "customNonbondedParameters");
    if (force.getNumGlobalParameters() > 0)
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customNonbondedGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customNonbondedGlobals", CL_MEM_READ_ONLY);
    vector<vector<cl_float> > paramVector(numParticles);
    vector<vector<int> > exclusionList(numParticles);
    for (int i = 0; i < numParticles; i++) {
@@ -1704,12 +1710,12 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
        functions[name] = &fp;
        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
        vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, min, max);
-        tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
+        tabulatedFunctions.push_back(OpenCLArray::create<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
        cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
    }
    if (force.getNumFunctions() > 0) {
-        tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
+        tabulatedFunctionParams = OpenCLArray::create<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", CL_MEM_READ_ONLY);
        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
        cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
    }
@@ -1838,25 +1844,25 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
    if (cl.getPlatformData().contexts.size() > 1)
        throw OpenMMException("GBSAOBCForce does not support using multiple OpenCL devices");
    OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
-    params = new OpenCLArray<mm_float2>(cl, cl.getPaddedNumAtoms(), "gbsaObcParams");
-    bornRadii = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms(), "bornRadii");
-    obcChain = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms(), "obcChain");
+    params = OpenCLArray::create<mm_float2>(cl, cl.getPaddedNumAtoms(), "gbsaObcParams");
+    bornRadii = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "bornRadii");
+    obcChain = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "obcChain");
    if (cl.getSupports64BitGlobalAtomics()) {
-        longBornSum = new OpenCLArray<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornSum");
-        longBornForce = new OpenCLArray<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornForce");
-        bornForce = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms(), "bornForce");
+        longBornSum = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornSum");
+        longBornForce = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornForce");
+        bornForce = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "bornForce");
        cl.addAutoclearBuffer(longBornSum->getDeviceBuffer(), 2*longBornSum->getSize());
        cl.addAutoclearBuffer(longBornForce->getDeviceBuffer(), 2*longBornForce->getSize());
    }
    else {
-        bornSum = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
-        bornForce = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
+        bornSum = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
+        bornForce = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
        cl.addAutoclearBuffer(bornSum->getDeviceBuffer(), bornSum->getSize());
        cl.addAutoclearBuffer(bornForce->getDeviceBuffer(), bornForce->getSize());
    }
-    OpenCLArray<mm_float4>& posq = cl.getPosq();
+    vector<mm_float4> posq(cl.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
    int numParticles = force.getNumParticles();
-    vector<mm_float2> paramsVector(numParticles);
+    vector<mm_float2> paramsVector(cl.getPaddedNumAtoms());
    const double dielectricOffset = 0.009;
    for (int i = 0; i < numParticles; i++) {
        double charge, radius, scalingFactor;
@@ -1865,7 +1871,7 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
        paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
        posq[i].w = (float) charge;
    }
-    posq.upload();
+    cl.getPosq().upload(posq);
    params->upload(paramsVector);
    prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
@@ -2006,18 +2012,19 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context,
    
    // Record the per-particle parameters.
    
-    OpenCLArray<mm_float4>& posq = cl.getPosq();
-    posq.download();
-    vector<mm_float2> paramsVector(numParticles);
+    OpenCLArray& posq = cl.getPosq();
+    posq.download((mm_float4*) cl.getPinnedBuffer());
+    mm_float4* posqf = (mm_float4*) cl.getPinnedBuffer();
+    vector<mm_float2> paramsVector(cl.getPaddedNumAtoms());
    const double dielectricOffset = 0.009;
    for (int i = 0; i < numParticles; i++) {
        double charge, radius, scalingFactor;
        force.getParticleParameters(i, charge, radius, scalingFactor);
        radius -= dielectricOffset;
        paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
-        posq[i].w = (float) charge;
+        posqf[i].w = (float) charge;
    }
-    posq.upload();
+    posq.upload(cl.getPinnedBuffer());
    params->upload(paramsVector);
    
    // Mark that the current reordering may be invalid.
@@ -2107,7 +2114,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
    params = new OpenCLParameterSet(cl, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
    computedValues = new OpenCLParameterSet(cl, force.getNumComputedValues(), numParticles, "customGBComputedValues", true);
    if (force.getNumGlobalParameters() > 0)
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customGBGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customGBGlobals", CL_MEM_READ_ONLY);
    vector<vector<cl_float> > paramVector(numParticles);
    vector<vector<int> > exclusionList(numParticles);
    for (int i = 0; i < numParticles; i++) {
@@ -2143,13 +2150,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        functions[name] = &fp;
        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
        vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, min, max);
-        tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
+        tabulatedFunctions.push_back(OpenCLArray::create<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
        cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
        tableArgs << ", __global const float4* restrict " << arrayName;
    }
    if (force.getNumFunctions() > 0) {
-        tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
+        tabulatedFunctionParams = OpenCLArray::create<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", CL_MEM_READ_ONLY);
        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
        cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
        tableArgs << ", __global const float4* " << prefix << "functionParams";
@@ -2207,7 +2214,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
    if (useLong) {
-        longEnergyDerivs = new OpenCLArray<cl_long>(cl, force.getNumComputedValues()*cl.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
+        longEnergyDerivs = OpenCLArray::create<cl_long>(cl, force.getNumComputedValues()*cl.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
        energyDerivs = new OpenCLParameterSet(cl, force.getNumComputedValues(), cl.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
    }
    else
@@ -2748,12 +2755,12 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
        bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
        if (useLong) {
-            longValueBuffers = new OpenCLArray<cl_long>(cl, cl.getPaddedNumAtoms(), "customGBLongValueBuffers");
+            longValueBuffers = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "customGBLongValueBuffers");
            cl.addAutoclearBuffer(longValueBuffers->getDeviceBuffer(), 2*longValueBuffers->getSize());
            cl.clearBuffer(longValueBuffers->getDeviceBuffer(), 2*longValueBuffers->getSize());
        }
        else {
-            valueBuffers = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "customGBValueBuffers");
+            valueBuffers = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "customGBValueBuffers");
            cl.addAutoclearBuffer(valueBuffers->getDeviceBuffer(), valueBuffers->getSize());
            cl.clearBuffer(*valueBuffers);
        }
@@ -3045,7 +3052,7 @@ void OpenCLCalcCustomExternalForceKernel::initialize(const System& system, const
        variables[name] = "particleParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customExternalGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customExternalGlobals", CL_MEM_READ_ONLY);
        globals->upload(globalParamValues);
        string argName = cl.getBondedUtilities().addArgument(globals->getDeviceBuffer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -3232,12 +3239,12 @@ void OpenCLCalcCustomHbondForceKernel::initialize(const System& system, const Cu
    if (numDonors == 0 || numAcceptors == 0)
        return;
    int numParticles = system.getNumParticles();
-    donors = new OpenCLArray<mm_int4>(cl, numDonors, "customHbondDonors");
-    acceptors = new OpenCLArray<mm_int4>(cl, numAcceptors, "customHbondAcceptors");
+    donors = OpenCLArray::create<mm_int4>(cl, numDonors, "customHbondDonors");
+    acceptors = OpenCLArray::create<mm_int4>(cl, numAcceptors, "customHbondAcceptors");
    donorParams = new OpenCLParameterSet(cl, force.getNumPerDonorParameters(), numDonors, "customHbondDonorParameters");
    acceptorParams = new OpenCLParameterSet(cl, force.getNumPerAcceptorParameters(), numAcceptors, "customHbondAcceptorParameters");
    if (force.getNumGlobalParameters() > 0)
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customHbondGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customHbondGlobals", CL_MEM_READ_ONLY);
    vector<vector<cl_float> > donorParamVector(numDonors);
    vector<mm_int4> donorVector(numDonors);
    for (int i = 0; i < numDonors; i++) {
@@ -3263,8 +3270,8 @@ void OpenCLCalcCustomHbondForceKernel::initialize(const System& system, const Cu

    // Select an output buffer index for each donor and acceptor.

-    donorBufferIndices = new OpenCLArray<mm_int4>(cl, numDonors, "customHbondDonorBuffers");
-    acceptorBufferIndices = new OpenCLArray<mm_int4>(cl, numAcceptors, "customHbondAcceptorBuffers");
+    donorBufferIndices = OpenCLArray::create<mm_int4>(cl, numDonors, "customHbondDonorBuffers");
+    acceptorBufferIndices = OpenCLArray::create<mm_int4>(cl, numAcceptors, "customHbondAcceptorBuffers");
    vector<mm_int4> donorBufferVector(numDonors);
    vector<mm_int4> acceptorBufferVector(numAcceptors);
    vector<int> donorBufferCounter(numParticles, 0);
@@ -3317,8 +3324,8 @@ void OpenCLCalcCustomHbondForceKernel::initialize(const System& system, const Cu
        else
            throw OpenMMException("CustomHbondForce: OpenCLPlatform does not support more than four exclusions per acceptor");
    }
-    donorExclusions = new OpenCLArray<mm_int4>(cl, numDonors, "customHbondDonorExclusions");
-    acceptorExclusions = new OpenCLArray<mm_int4>(cl, numAcceptors, "customHbondAcceptorExclusions");
+    donorExclusions = OpenCLArray::create<mm_int4>(cl, numDonors, "customHbondDonorExclusions");
+    acceptorExclusions = OpenCLArray::create<mm_int4>(cl, numAcceptors, "customHbondAcceptorExclusions");
    donorExclusions->upload(donorExclusionVector);
    acceptorExclusions->upload(acceptorExclusionVector);

@@ -3339,12 +3346,12 @@ void OpenCLCalcCustomHbondForceKernel::initialize(const System& system, const Cu
        functions[name] = &fp;
        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
        vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, min, max);
-        tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
+        tabulatedFunctions.push_back(OpenCLArray::create<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
        tableArgs << ", __global const float4* restrict " << arrayName;
    }
    if (force.getNumFunctions() > 0) {
-        tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
+        tabulatedFunctionParams = OpenCLArray::create<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", CL_MEM_READ_ONLY);
        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
        tableArgs << ", __global const float4* restrict functionParams";
    }
@@ -3728,7 +3735,7 @@ void OpenCLCalcCustomCompoundBondForceKernel::initialize(const System& system, c
        functions[name] = &fp;
        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
        vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, min, max);
-        OpenCLArray<mm_float4>* array = new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction");
+        OpenCLArray* array = OpenCLArray::create<mm_float4>(cl, values.size()-1, "TabulatedFunction");
        tabulatedFunctions.push_back(array);
        array->upload(f);
        string arrayName = cl.getBondedUtilities().addArgument(array->getDeviceBuffer(), "float4");
@@ -3736,7 +3743,7 @@ void OpenCLCalcCustomCompoundBondForceKernel::initialize(const System& system, c
    }
    string functionParamsName;
    if (force.getNumFunctions() > 0) {
-        tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
+        tabulatedFunctionParams = OpenCLArray::create<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", CL_MEM_READ_ONLY);
        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
        functionParamsName = cl.getBondedUtilities().addArgument(tabulatedFunctionParams->getDeviceBuffer(), "float4");
    }
@@ -3761,7 +3768,7 @@ void OpenCLCalcCustomCompoundBondForceKernel::initialize(const System& system, c
        variables[name] = "bondParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = new OpenCLArray<cl_float>(cl, force.getNumGlobalParameters(), "customCompoundBondGlobals", false, CL_MEM_READ_ONLY);
+        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customCompoundBondGlobals", CL_MEM_READ_ONLY);
        globals->upload(globalParamValues);
        string argName = cl.getBondedUtilities().addArgument(globals->getDeviceBuffer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -4042,7 +4049,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L
    cl::Program program = cl.createProgram(OpenCLKernelSources::langevin, defines, "");
    kernel1 = cl::Kernel(program, "integrateLangevinPart1");
    kernel2 = cl::Kernel(program, "integrateLangevinPart2");
-    params = new OpenCLArray<cl_float>(cl, 3, "langevinParams");
+    params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams");
    prevStepSize = -1.0;
 }

@@ -4078,8 +4085,8 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
        p[1] = (cl_float) fscale;
        p[2] = (cl_float) noisescale;
        params->upload(p);
-        integration.getStepSize()[0].y = (cl_float) stepSize;
-        integration.getStepSize().upload();
+        mm_float2 ss = mm_float2(0, (float) stepSize);
+        integration.getStepSize().upload(&ss);
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
@@ -4222,8 +4229,9 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co

    // Update the time and step count.

-    cl.getIntegrationUtilities().getStepSize().download();
-    double dt = cl.getIntegrationUtilities().getStepSize()[0].y;
+    mm_float2 stepSize;
+    cl.getIntegrationUtilities().getStepSize().download(&stepSize);
+    double dt = stepSize.y;
    double time = cl.getTime()+dt;
    if (dt == maxStepSize)
        time = maxTime; // Avoid round-off error
@@ -4247,7 +4255,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
    kernel1 = cl::Kernel(program, "integrateLangevinPart1");
    kernel2 = cl::Kernel(program, "integrateLangevinPart2");
    selectSizeKernel = cl::Kernel(program, "selectLangevinStepSize");
-    params = new OpenCLArray<cl_float>(cl, 3, "langevinParams");
+    params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams");
    blockSize = min(256, system.getNumParticles());
    blockSize = max(blockSize, params->getSize());
    blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
@@ -4301,8 +4309,9 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,

    // Update the time and step count.

-    cl.getIntegrationUtilities().getStepSize().download();
-    double dt = cl.getIntegrationUtilities().getStepSize()[0].y;
+    mm_float2 stepSize;
+    cl.getIntegrationUtilities().getStepSize().download(&stepSize);
+    double dt = stepSize.y;
    double time = cl.getTime()+dt;
    if (dt == maxStepSize)
        time = maxTime; // Avoid round-off error
@@ -4334,7 +4343,7 @@ public:
            swap[3*lastAtomOrder[i]+1] = localPerDofValues[3*i+1];
            swap[3*lastAtomOrder[i]+2] = localPerDofValues[3*i+2];
        }
-        OpenCLArray<cl_int>& order = cl.getAtomIndex();
+        const vector<cl_int>& order = cl.getAtomIndex();
        for (int i = 0; i < numAtoms; i++) {
            localPerDofValues[3*i] = swap[3*order[i]];
            localPerDofValues[3*i+1] = swap[3*order[i]+1];
@@ -4374,9 +4383,9 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
    cl.getPlatformData().initializeContexts(system);
    cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
-    globalValues = new OpenCLArray<cl_float>(cl, max(1, numGlobalVariables), "globalVariables", true);
-    sumBuffer = new OpenCLArray<cl_float>(cl, 3*system.getNumParticles(), "sumBuffer");
-    energy = new OpenCLArray<cl_float>(cl, 1, "energy");
+    globalValues = OpenCLArray::create<cl_float>(cl, max(1, numGlobalVariables), "globalVariables");
+    sumBuffer = OpenCLArray::create<cl_float>(cl, 3*system.getNumParticles(), "sumBuffer");
+    energy = OpenCLArray::create<cl_float>(cl, 1, "energy");
    perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables");
    cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValues, deviceValuesAreCurrent));
    prevStepSize = -1.0;
@@ -4459,12 +4468,13 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
        // Initialize various data structures.
        
        const map<string, double>& params = context.getParameters();
-        contextParameterValues = new OpenCLArray<cl_float>(cl, max(1, (int) params.size()), "contextParameters", true);
+        contextParameterValues = OpenCLArray::create<cl_float>(cl, max(1, (int) params.size()), "contextParameters");
+        contextValues.resize(contextParameterValues->getSize());
        for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
-            contextParameterValues->set(parameterNames.size(), (float) iter->second);
+            contextValues[parameterNames.size()] = (float) iter->second;
            parameterNames.push_back(iter->first);
        }
-        contextParameterValues->upload();
+        contextParameterValues->upload(contextValues);
        kernels.resize(integrator.getNumComputations());
        requiredGaussian.resize(integrator.getNumComputations(), 0);
        requiredUniform.resize(integrator.getNumComputations(), 0);
@@ -4480,8 +4490,8 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
        
        // Initialize the random number generator.
        
-        uniformRandoms = new OpenCLArray<mm_float4>(cl, cl.getNumAtoms(), "uniformRandoms");
-        randomSeed = new OpenCLArray<mm_int4>(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
+        uniformRandoms = OpenCLArray::create<mm_float4>(cl, cl.getNumAtoms(), "uniformRandoms");
+        randomSeed = OpenCLArray::create<mm_int4>(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
        vector<mm_int4> seed(randomSeed->getSize());
        unsigned int r = integrator.getRandomNumberSeed()+1;
        for (int i = 0; i < randomSeed->getSize(); i++) {
@@ -4744,20 +4754,20 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
    if (stepSize != prevStepSize) {
-        integration.getStepSize()[0].y = (cl_float) stepSize;
-        integration.getStepSize().upload();
+        mm_float2 ss = mm_float2(0, (float) stepSize);
+        integration.getStepSize().upload(&ss);
        prevStepSize = stepSize;
    }
    bool paramsChanged = false;
    for (int i = 0; i < (int) parameterNames.size(); i++) {
        float value = (float) context.getParameter(parameterNames[i]);
-        if (value != contextParameterValues->get(i)) {
-            contextParameterValues->set(i, value);
+        if (value != contextValues[i]) {
+            contextValues[i] = value;
            paramsChanged = true;
        }
    }
    if (paramsChanged)
-        contextParameterValues->upload();
+        contextParameterValues->upload(contextValues);

    // Loop over computation steps in the integrator and execute them.

@@ -4829,25 +4839,33 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
 void OpenCLIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
    if (!modifiesParameters)
        return;
-    contextParameterValues->download();
+    contextParameterValues->download(contextValues);
    for (int i = 0; i < (int) parameterNames.size(); i++) {
        float value = (float) context.getParameter(parameterNames[i]);
-        if (value != contextParameterValues->get(i))
-            context.setParameter(parameterNames[i], contextParameterValues->get(i));
+        if (value != contextValues[i])
+            context.setParameter(parameterNames[i], contextValues[i]);
    }
 }

 void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vector<double>& values) const {
-    globalValues->download();
+    if (numGlobalVariables == 0) {
+        values.resize(0);
+        return;
+    }
+    vector<cl_float> buffer;
+    globalValues->download(buffer);
    values.resize(numGlobalVariables);
    for (int i = 0; i < numGlobalVariables; i++)
-        values[i] = globalValues->get(i);
+        values[i] = buffer[i];
 }

 void OpenCLIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
+    if (numGlobalVariables == 0)
+        return;
+    vector<cl_float> valuesVec(numGlobalVariables);
    for (int i = 0; i < numGlobalVariables; i++)
-        globalValues->set(i, (float) values[i]);
-    globalValues->upload();
+        valuesVec[i] = (float) values[i];
+    globalValues->upload(valuesVec);
 }

 void OpenCLIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
@@ -4856,7 +4874,7 @@ void OpenCLIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, in
        localValuesAreCurrent = true;
    }
    values.resize(perDofValues->getNumObjects()/3);
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    const vector<cl_int>& order = cl.getAtomIndex();
    for (int i = 0; i < (int) values.size(); i++)
        for (int j = 0; j < 3; j++)
            values[order[i]][j] = localPerDofValues[3*i+j][variable];
@@ -4867,7 +4885,7 @@ void OpenCLIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, in
        perDofValues->getParameterValues(localPerDofValues);
        localValuesAreCurrent = true;
    }
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    const vector<cl_int>& order = cl.getAtomIndex();
    for (int i = 0; i < (int) values.size(); i++)
        for (int j = 0; j < 3; j++)
            localPerDofValues[3*i+j][variable] = (float) values[order[i]][j];
@@ -4890,7 +4908,7 @@ void OpenCLApplyAndersenThermostatKernel::initialize(const System& system, const
    // Create the arrays with the group definitions.

    vector<vector<int> > groups = AndersenThermostatImpl::calcParticleGroups(system);
-    atomGroups = new OpenCLArray<int>(cl, cl.getNumAtoms(), "atomGroups");
+    atomGroups = OpenCLArray::create<int>(cl, cl.getNumAtoms(), "atomGroups");
    vector<int> atoms(atomGroups->getSize());
    for (int i = 0; i < (int) groups.size(); i++) {
        for (int j = 0; j < (int) groups[i].size(); j++)
@@ -4923,7 +4941,7 @@ OpenCLApplyMonteCarloBarostatKernel::~OpenCLApplyMonteCarloBarostatKernel() {
 }

 void OpenCLApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) {
-    savedPositions = new OpenCLArray<mm_float4>(cl, cl.getPaddedNumAtoms(), "savedPositions");
+    savedPositions = OpenCLArray::create<mm_float4>(cl, cl.getPaddedNumAtoms(), "savedPositions");
    cl::Program program = cl.createProgram(OpenCLKernelSources::monteCarloBarostat);
    kernel = cl::Kernel(program, "scalePositions");
 }
@@ -4936,8 +4954,8 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context,

        vector<vector<int> > molecules = context.getMolecules();
        numMolecules = molecules.size();
-        moleculeAtoms = new OpenCLArray<int>(cl, cl.getNumAtoms(), "moleculeAtoms");
-        moleculeStartIndex = new OpenCLArray<int>(cl, numMolecules+1, "moleculeStartIndex");
+        moleculeAtoms = OpenCLArray::create<int>(cl, cl.getNumAtoms(), "moleculeAtoms");
+        moleculeStartIndex = OpenCLArray::create<int>(cl, numMolecules+1, "moleculeStartIndex");
        vector<int> atoms(moleculeAtoms->getSize());
        vector<int> startIndex(moleculeStartIndex->getSize());
        int index = 0;
@@ -4981,10 +4999,10 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
    // We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
    // on the CPU.

-    OpenCLArray<mm_float4>& velm = cl.getVelm();
-    velm.download();
+    mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
+    cl.getVelm().download(velm);
    double energy = 0.0;
-    OpenCLArray<cl_int>& order = cl.getAtomIndex();
+    const vector<cl_int>& order = cl.getAtomIndex();
    for (size_t i = 0; i < masses.size(); ++i) {
        mm_float4 v = velm[i];
        energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
@@ -5000,7 +5018,7 @@ OpenCLRemoveCMMotionKernel::~OpenCLRemoveCMMotionKernel() {
 void OpenCLRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
    frequency = force.getFrequency();
    int numAtoms = cl.getNumAtoms();
-    cmMomentum = new OpenCLArray<mm_float4>(cl, (numAtoms+OpenCLContext::ThreadBlockSize-1)/OpenCLContext::ThreadBlockSize, "cmMomentum");
+    cmMomentum = OpenCLArray::create<mm_float4>(cl, (numAtoms+OpenCLContext::ThreadBlockSize-1)/OpenCLContext::ThreadBlockSize, "cmMomentum");
    double totalMass = 0.0;
    for (int i = 0; i < numAtoms; i++)
        totalMass += system.getParticleMass(i);

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -255,7 +255,7 @@ private:
    bool hasInitializedKernel;
    OpenCLContext& cl;
    System& system;
-    OpenCLArray<mm_float2>* params;
+    OpenCLArray* params;
 };

 /**
@@ -296,7 +296,7 @@ private:
    OpenCLContext& cl;
    System& system;
    OpenCLParameterSet* params;
-    OpenCLArray<cl_float>* globals;
+    OpenCLArray* globals;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
 };
@@ -338,7 +338,7 @@ private:
    bool hasInitializedKernel;
    OpenCLContext& cl;
    System& system;
-    OpenCLArray<mm_float2>* params;
+    OpenCLArray* params;
 };

 /**
@@ -379,7 +379,7 @@ private:
    OpenCLContext& cl;
    System& system;
    OpenCLParameterSet* params;
-    OpenCLArray<cl_float>* globals;
+    OpenCLArray* globals;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
 };
@@ -421,7 +421,7 @@ private:
    bool hasInitializedKernel;
    OpenCLContext& cl;
    System& system;
-    OpenCLArray<mm_float4>* params;
+    OpenCLArray* params;
 };

 /**
@@ -461,7 +461,7 @@ private:
    bool hasInitializedKernel;
    OpenCLContext& cl;
    System& system;
-    OpenCLArray<mm_float8>* params;
+    OpenCLArray* params;
 };

 /**
@@ -494,9 +494,9 @@ private:
    bool hasInitializedKernel;
    OpenCLContext& cl;
    System& system;
-    OpenCLArray<mm_float4>* coefficients;
-    OpenCLArray<mm_int2>* mapPositions;
-    OpenCLArray<cl_int>* torsionMaps;
+    OpenCLArray* coefficients;
+    OpenCLArray* mapPositions;
+    OpenCLArray* torsionMaps;
 };

 /**
@@ -537,7 +537,7 @@ private:
    OpenCLContext& cl;
    System& system;
    OpenCLParameterSet* params;
-    OpenCLArray<cl_float>* globals;
+    OpenCLArray* globals;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
 };
@@ -591,18 +591,18 @@ private:
    };
    OpenCLContext& cl;
    bool hasInitializedKernel;
-    OpenCLArray<mm_float2>* sigmaEpsilon;
-    OpenCLArray<mm_float4>* exceptionParams;
-    OpenCLArray<mm_float2>* cosSinSums;
-    OpenCLArray<mm_float2>* pmeGrid;
-    OpenCLArray<mm_float2>* pmeGrid2;
-    OpenCLArray<cl_float>* pmeBsplineModuliX;
-    OpenCLArray<cl_float>* pmeBsplineModuliY;
-    OpenCLArray<cl_float>* pmeBsplineModuliZ;
-    OpenCLArray<mm_float4>* pmeBsplineTheta;
-    OpenCLArray<mm_float4>* pmeBsplineDTheta;
-    OpenCLArray<cl_int>* pmeAtomRange;
-    OpenCLArray<mm_int2>* pmeAtomGridIndex;
+    OpenCLArray* sigmaEpsilon;
+    OpenCLArray* exceptionParams;
+    OpenCLArray* cosSinSums;
+    OpenCLArray* pmeGrid;
+    OpenCLArray* pmeGrid2;
+    OpenCLArray* pmeBsplineModuliX;
+    OpenCLArray* pmeBsplineModuliY;
+    OpenCLArray* pmeBsplineModuliZ;
+    OpenCLArray* pmeBsplineTheta;
+    OpenCLArray* pmeBsplineDTheta;
+    OpenCLArray* pmeAtomRange;
+    OpenCLArray* pmeAtomGridIndex;
    OpenCLSort<SortTrait>* sort;
    OpenCLFFT3D* fft;
    cl::Kernel ewaldSumsKernel;
@@ -658,11 +658,11 @@ public:
 private:
    OpenCLContext& cl;
    OpenCLParameterSet* params;
-    OpenCLArray<cl_float>* globals;
-    OpenCLArray<mm_float4>* tabulatedFunctionParams;
+    OpenCLArray* globals;
+    OpenCLArray* tabulatedFunctionParams;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
-    std::vector<OpenCLArray<mm_float4>*> tabulatedFunctions;
+    std::vector<OpenCLArray*> tabulatedFunctions;
    System& system;
 };

@@ -704,13 +704,13 @@ private:
    bool hasCreatedKernels;
    int maxTiles;
    OpenCLContext& cl;
-    OpenCLArray<mm_float2>* params;
-    OpenCLArray<cl_float>* bornSum;
-    OpenCLArray<cl_long>* longBornSum;
-    OpenCLArray<cl_float>* bornRadii;
-    OpenCLArray<cl_float>* bornForce;
-    OpenCLArray<cl_long>* longBornForce;
-    OpenCLArray<cl_float>* obcChain;
+    OpenCLArray* params;
+    OpenCLArray* bornSum;
+    OpenCLArray* longBornSum;
+    OpenCLArray* bornRadii;
+    OpenCLArray* bornForce;
+    OpenCLArray* longBornForce;
+    OpenCLArray* obcChain;
    cl::Kernel computeBornSumKernel;
    cl::Kernel reduceBornSumKernel;
    cl::Kernel force1Kernel;
@@ -757,14 +757,14 @@ private:
    OpenCLParameterSet* params;
    OpenCLParameterSet* computedValues;
    OpenCLParameterSet* energyDerivs;
-    OpenCLArray<cl_long>* longEnergyDerivs;
-    OpenCLArray<cl_float>* globals;
-    OpenCLArray<cl_float>* valueBuffers;
-    OpenCLArray<cl_long>* longValueBuffers;
-    OpenCLArray<mm_float4>* tabulatedFunctionParams;
+    OpenCLArray* longEnergyDerivs;
+    OpenCLArray* globals;
+    OpenCLArray* valueBuffers;
+    OpenCLArray* longValueBuffers;
+    OpenCLArray* tabulatedFunctionParams;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
-    std::vector<OpenCLArray<mm_float4>*> tabulatedFunctions;
+    std::vector<OpenCLArray*> tabulatedFunctions;
    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
    System& system;
    cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
@@ -808,7 +808,7 @@ private:
    OpenCLContext& cl;
    System& system;
    OpenCLParameterSet* params;
-    OpenCLArray<cl_float>* globals;
+    OpenCLArray* globals;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
 };
@@ -853,17 +853,17 @@ private:
    OpenCLContext& cl;
    OpenCLParameterSet* donorParams;
    OpenCLParameterSet* acceptorParams;
-    OpenCLArray<cl_float>* globals;
-    OpenCLArray<mm_int4>* donors;
-    OpenCLArray<mm_int4>* acceptors;
-    OpenCLArray<mm_int4>* donorBufferIndices;
-    OpenCLArray<mm_int4>* acceptorBufferIndices;
-    OpenCLArray<mm_int4>* donorExclusions;
-    OpenCLArray<mm_int4>* acceptorExclusions;
-    OpenCLArray<mm_float4>* tabulatedFunctionParams;
+    OpenCLArray* globals;
+    OpenCLArray* donors;
+    OpenCLArray* acceptors;
+    OpenCLArray* donorBufferIndices;
+    OpenCLArray* acceptorBufferIndices;
+    OpenCLArray* donorExclusions;
+    OpenCLArray* acceptorExclusions;
+    OpenCLArray* tabulatedFunctionParams;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
-    std::vector<OpenCLArray<mm_float4>*> tabulatedFunctions;
+    std::vector<OpenCLArray*> tabulatedFunctions;
    System& system;
    cl::Kernel donorKernel, acceptorKernel;
 };
@@ -905,11 +905,11 @@ private:
    int numBonds;
    OpenCLContext& cl;
    OpenCLParameterSet* params;
-    OpenCLArray<cl_float>* globals;
-    OpenCLArray<mm_float4>* tabulatedFunctionParams;
+    OpenCLArray* globals;
+    OpenCLArray* tabulatedFunctionParams;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
-    std::vector<OpenCLArray<mm_float4>*> tabulatedFunctions;
+    std::vector<OpenCLArray*> tabulatedFunctions;
    System& system;
 };

@@ -970,7 +970,7 @@ private:
    OpenCLContext& cl;
    double prevTemp, prevFriction, prevStepSize;
    bool hasInitializedKernels;
-    OpenCLArray<cl_float>* params;
+    OpenCLArray* params;
    cl::Kernel kernel1, kernel2;
 };

@@ -1065,7 +1065,7 @@ private:
    OpenCLContext& cl;
    bool hasInitializedKernels;
    int blockSize;
-    OpenCLArray<cl_float>* params;
+    OpenCLArray* params;
    cl::Kernel kernel1, kernel2, selectSizeKernel;
    double prevTemp, prevFriction, prevErrorTol;
 };
@@ -1138,14 +1138,15 @@ private:
    int numGlobalVariables;
    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
    mutable bool localValuesAreCurrent;
-    OpenCLArray<cl_float>* globalValues;
-    OpenCLArray<cl_float>* contextParameterValues;
-    OpenCLArray<cl_float>* sumBuffer;
-    OpenCLArray<cl_float>* energy;
-    OpenCLArray<mm_float4>* uniformRandoms;
-    OpenCLArray<mm_int4>* randomSeed;
+    OpenCLArray* globalValues;
+    OpenCLArray* contextParameterValues;
+    OpenCLArray* sumBuffer;
+    OpenCLArray* energy;
+    OpenCLArray* uniformRandoms;
+    OpenCLArray* randomSeed;
    OpenCLParameterSet* perDofValues;
    mutable std::vector<std::vector<cl_float> > localPerDofValues;
+    std::vector<float> contextValues;
    std::vector<std::vector<cl::Kernel> > kernels;
    cl::Kernel sumEnergyKernel, randomKernel;
    std::vector<CustomIntegrator::ComputationType> stepType;
@@ -1185,7 +1186,7 @@ private:
    OpenCLContext& cl;
    bool hasInitializedKernels;
    int randomSeed;
-    OpenCLArray<cl_int>* atomGroups;
+    OpenCLArray* atomGroups;
    cl::Kernel kernel;
 };

@@ -1226,9 +1227,9 @@ private:
    OpenCLContext& cl;
    bool hasInitializedKernels;
    int numMolecules;
-    OpenCLArray<mm_float4>* savedPositions;
-    OpenCLArray<cl_int>* moleculeAtoms;
-    OpenCLArray<cl_int>* moleculeStartIndex;
+    OpenCLArray* savedPositions;
+    OpenCLArray* moleculeAtoms;
+    OpenCLArray* moleculeStartIndex;
    cl::Kernel kernel;
 };

@@ -1280,7 +1281,7 @@ public:
 private:
    OpenCLContext& cl;
    int frequency;
-    OpenCLArray<mm_float4>* cmMomentum;
+    OpenCLArray* cmMomentum;
    cl::Kernel kernel1, kernel2;
 };


--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -191,14 +191,14 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        exclusionIndicesVec.push_back(iter->second);
    }
    exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
-    exclusionIndices = new OpenCLArray<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
-    exclusionRowIndices = new OpenCLArray<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
+    exclusionIndices = OpenCLArray::create<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
+    exclusionRowIndices = OpenCLArray::create<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices->upload(exclusionIndicesVec);
    exclusionRowIndices->upload(exclusionRowIndicesVec);

    // Record the exclusion data.

-    exclusions = new OpenCLArray<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
+    exclusions = OpenCLArray::create<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
    vector<cl_uint> exclusionVec(exclusions->getSize());
    for (int i = 0; i < exclusions->getSize(); ++i)
        exclusionVec[i] = 0xFFFFFFFF;
@@ -253,13 +253,13 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
            maxInteractingTiles = numTiles;
        if (maxInteractingTiles < 1)
            maxInteractingTiles = 1;
-        interactingTiles = new OpenCLArray<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
-        interactionFlags = new OpenCLArray<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags");
-        interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount", true);
-        blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter");
-        blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
-        interactionCount->set(0, 0);
-        interactionCount->upload();
+        interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
+        interactionFlags = OpenCLArray::create<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags");
+        interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount");
+        blockCenter = OpenCLArray::create<mm_float4>(context, numAtomBlocks, "blockCenter");
+        blockBoundingBox = OpenCLArray::create<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
+        vector<cl_uint> count(1, 0);
+        interactionCount->upload(count);
    }

    // Create kernels.
@@ -353,26 +353,27 @@ void OpenCLNonbondedUtilities::computeInteractions() {
 void OpenCLNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return;
-    interactionCount->download();
-    if (interactionCount->get(0) <= (unsigned int) interactingTiles->getSize())
+    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
+    interactionCount->download(pinnedInteractionCount);
+    if (pinnedInteractionCount[0] <= (unsigned int) interactingTiles->getSize())
        return;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

-    int newSize = (int) (1.2*interactionCount->get(0));
+    int newSize = (int) (1.2*pinnedInteractionCount[0]);
    int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    if (newSize > numTiles)
        newSize = numTiles;
    delete interactingTiles;
-    interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
+    interactingTiles = OpenCLArray::create<mm_ushort2>(context, newSize, "interactingTiles");
    forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
    forceKernel.setArg<cl_uint>(12, newSize);
    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
    findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
    if (context.getSIMDWidth() == 32 || deviceIsCpu) {
        delete interactionFlags;
-        interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
+        interactionFlags = OpenCLArray::create<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
        forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
        findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
 		if (!deviceIsCpu) {

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
@@ -170,49 +170,49 @@ public:
    /**
     * Get the array containing the center of each atom block.
     */
-    OpenCLArray<mm_float4>& getBlockCenters() {
+    OpenCLArray& getBlockCenters() {
        return *blockCenter;
    }
    /**
     * Get the array containing the dimensions of each atom block.
     */
-    OpenCLArray<mm_float4>& getBlockBoundingBoxes() {
+    OpenCLArray& getBlockBoundingBoxes() {
        return *blockBoundingBox;
    }
    /**
     * Get the array whose first element contains the number of tiles with interactions.
     */
-    OpenCLArray<cl_uint>& getInteractionCount() {
+    OpenCLArray& getInteractionCount() {
        return *interactionCount;
    }
    /**
     * Get the array containing tiles with interactions.
     */
-    OpenCLArray<mm_ushort2>& getInteractingTiles() {
+    OpenCLArray& getInteractingTiles() {
        return *interactingTiles;
    }
    /**
     * Get the array containing flags for tiles with interactions.
     */
-    OpenCLArray<cl_uint>& getInteractionFlags() {
+    OpenCLArray& getInteractionFlags() {
        return *interactionFlags;
    }
    /**
     * Get the array containing exclusion flags.
     */
-    OpenCLArray<cl_uint>& getExclusions() {
+    OpenCLArray& getExclusions() {
        return *exclusions;
    }
    /**
     * Get the array containing the index into the exclusion array for each tile.
     */
-    OpenCLArray<cl_uint>& getExclusionIndices() {
+    OpenCLArray& getExclusionIndices() {
        return *exclusionIndices;
    }
    /**
     * Get the array listing where the exclusion data starts for each row.
     */
-    OpenCLArray<cl_uint>& getExclusionRowIndices() {
+    OpenCLArray& getExclusionRowIndices() {
        return *exclusionRowIndices;
    }
    /**
@@ -250,14 +250,14 @@ private:
    cl::Kernel findBlockBoundsKernel;
    cl::Kernel findInteractingBlocksKernel;
    cl::Kernel findInteractionsWithinBlocksKernel;
-    OpenCLArray<cl_uint>* exclusions;
-    OpenCLArray<cl_uint>* exclusionIndices;
-    OpenCLArray<cl_uint>* exclusionRowIndices;
-    OpenCLArray<mm_ushort2>* interactingTiles;
-    OpenCLArray<cl_uint>* interactionFlags;
-    OpenCLArray<cl_uint>* interactionCount;
-    OpenCLArray<mm_float4>* blockCenter;
-    OpenCLArray<mm_float4>* blockBoundingBox;
+    OpenCLArray* exclusions;
+    OpenCLArray* exclusionIndices;
+    OpenCLArray* exclusionRowIndices;
+    OpenCLArray* interactingTiles;
+    OpenCLArray* interactionFlags;
+    OpenCLArray* interactionCount;
+    OpenCLArray* blockCenter;
+    OpenCLArray* blockBoundingBox;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;

--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -130,8 +130,8 @@ void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
 void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    OpenCLContext& cl0 = *data.contexts[0];
    if (contextForces == NULL) {
-        contextForces = new OpenCLArray<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
-                data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces", true);
+        contextForces = OpenCLArray::create<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
+                data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces");
        int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*sizeof(mm_float4);
        pinnedPositionBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
        pinnedPositionMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);