Continuing implementation of OpenCL platform

b3781c23 · Peter Eastman · 437ca02f · b3781c23 · b3781c23 · b3781c23
Commit b3781c23 authored Sep 18, 2009 by Peter Eastman
10 changed files
--- a/platforms/opencl/CMakeLists.txt
+++ b/platforms/opencl/CMakeLists.txt
@@ -82,4 +82,8 @@ ENDFOREACH(subdir)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+# Install kernel files that will be loaded at runtime.
+FILE(GLOB OPENCL_KERNELS   ${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl src/kernels/*.h)
+INSTALL_FILES(/lib/plugins/opencl FILES ${OPENCL_KERNELS})
 SUBDIRS (sharedTarget)
--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -49,7 +49,7 @@ public:
        return name;
    }
    double getSpeed() const {
-        return 100;
+        return 0; // TODO Increase this.  Currently set to 0 so it will never be selected automatically.
    }
    bool supportsDoublePrecision() const;
    const std::string& getPropertyValue(const Context& context, const std::string& property) const;

--- a/platforms/opencl/src/OpenCLArray.h
+++ b/platforms/opencl/src/OpenCLArray.h
@@ -51,11 +51,25 @@ public:
     *                          the OpenCL Buffer
     */
    OpenCLArray(OpenCLContext& context, int size, const std::string& name, bool createHostBuffer = false) :
-            context(context), size(size), name(name), local(createHostBuffer ? size : 0) {
+            context(context), size(size), name(name), local(createHostBuffer ? size : 0), ownsBuffer(true) {
        buffer = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, size*sizeof(T));
    }
+    /**
+     * Create an OpenCLArray object the uses a preexisting Buffer.
+     *
+     * @param context           the context for which to create the array
+     * @param buffer            the OpenCL Buffer this object encapsulates
+     * @param size              the number of elements in the array
+     * @param name              the name of the array
+     * @param createHostBuffer  specifies whether to create a buffer in host memory for copying data to and from
+     *                          the OpenCL Buffer
+     */
+    OpenCLArray(OpenCLContext& context, cl::Buffer* buffer, int size, const std::string& name, bool createHostBuffer = false) :
+            context(context), buffer(buffer), size(size), name(name), local(createHostBuffer ? size : 0), ownsBuffer(false) {
+    }
    ~OpenCLArray() {
-        delete buffer;
+        if (ownsBuffer)
+            delete buffer;
    }
    const T& operator[](int index) const {
        return local[index];
@@ -63,6 +77,18 @@ public:
    T& operator[](int index) {
        return local[index];
    }
+    /**
+     * Get the size of the array.
+     */
+    int getSize() {
+        return size;
+    }
+    /**
+     * Get the OpenCL Buffer object.
+     */
+    cl::Buffer& getDeviceBuffer() {
+        return *buffer;
+    }
    /**
     * Get a pointer to the host buffer.
     */
@@ -114,6 +140,7 @@ private:
    cl::Buffer* buffer;
    std::vector<T> local;
    int size;
+    bool ownsBuffer;
    std::string name;
 };

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -26,25 +26,87 @@
 #include "OpenCLContext.h"
 #include "OpenCLArray.h"
+#include "openmm/Platform.h"
+#include <fstream>
+#include <iostream>
 using namespace OpenMM;
+using namespace std;
 OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceIndex) {
    // TODO Select the platform and device correctly
-    context = new cl::Context(CL_DEVICE_TYPE_CPU);
+    context = cl::Context(CL_DEVICE_TYPE_CPU);
-    queue = new cl::CommandQueue(getContext(), getContext().getInfo<CL_CONTEXT_DEVICES>()[0]);
+    device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-    posq = new OpenCLArray<cl_float4>(*this, numParticles, "posq", true);
+    queue = cl::CommandQueue(context, device);
-    velm = new OpenCLArray<cl_float4>(*this, numParticles, "velm", true);
+    numAtoms = numParticles;
-    force = new OpenCLArray<cl_float4>(*this, numParticles, "force", true);
+    paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
-    atomIndex = new OpenCLArray<cl_int>(*this, numParticles, "atomIndex", true);
+    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
+    numTiles = numAtomBlocks*(numAtomBlocks+1)/2;
+    numThreadBlocks = 8*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    forceBufferPerWarp = true;
+    numForceBuffers = numThreadBlocks*ThreadBlockSize/TileSize;
+    if (numForceBuffers >= numAtomBlocks) {
+        // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
+        forceBufferPerWarp = false;
+        numForceBuffers = numAtomBlocks;
+    }
+    posq = new OpenCLArray<cl_float4>(*this, paddedNumAtoms, "posq", true);
+    velm = new OpenCLArray<cl_float4>(*this, paddedNumAtoms, "velm", true);
+    forceBuffers = new OpenCLArray<cl_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
+    force = new OpenCLArray<cl_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
+    atomIndex = new OpenCLArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
+    for (int i = 0; i < paddedNumAtoms; ++i)
+        atomIndex->set(i, i);
+    atomIndex->upload();
+    // Create utility kernels that are used in multiple places.
+    utilities = createProgram(loadSourceFromFile("utilities.cl"));
+    clearBufferKernel = cl::Kernel(utilities, "clearBuffer");
 }
 OpenCLContext::~OpenCLContext() {
-    delete context;
-    delete queue;
    delete posq;
    delete velm;
    delete force;
    delete atomIndex;
 }
+string OpenCLContext::loadSourceFromFile(const string& filename) const {
+    ifstream file((Platform::getDefaultPluginsDirectory()+"/opencl/"+filename).c_str());
+    if (!file.is_open())
+        throw OpenMMException("Unable to load kernel: "+filename);
+    string kernel;
+    string line;
+    while (!file.eof()) {
+        getline(file, line);
+        kernel += line;
+        kernel += '\n';
+    }
+    file.close();
+    return kernel;
+}
+cl::Program OpenCLContext::createProgram(const std::string source) {
+    cl::Program::Sources sources(1, make_pair(source.c_str(), source.size()));
+    cl::Program program(context, sources);
+    try {
+        program.build(vector<cl::Device>(1, device));
+    } catch (cl::Error err) {
+        throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
+    }
+    return program;
+}
+void OpenCLContext::clearBuffer(OpenCLArray<float>& array) {
+    clearBufferKernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
+    clearBufferKernel.setArg<cl_int>(1, array.getSize());
+    queue.enqueueNDRangeKernel(clearBufferKernel, cl::NullRange, cl::NDRange(numThreadBlocks*ThreadBlockSize), cl::NDRange(ThreadBlockSize));
+}
+void OpenCLContext::clearBuffer(OpenCLArray<cl_float4>& array) {
+    clearBufferKernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
+    clearBufferKernel.setArg<cl_int>(1, array.getSize()*4);
+    queue.enqueueNDRangeKernel(clearBufferKernel, cl::NullRange, cl::NDRange(numThreadBlocks*ThreadBlockSize), cl::NDRange(ThreadBlockSize));
+}
\ No newline at end of file
--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -41,19 +41,21 @@ class OpenCLArray;
 class OpenCLContext {
 public:
+    static const int ThreadBlockSize = 64;
+    static const int TileSize = 32;
    OpenCLContext(int numParticles, int platformIndex, int deviceIndex);
    ~OpenCLContext();
    /**
     * Get the cl::Context associated with this object.
     */
    cl::Context& getContext() {
-        return *context;
+        return context;
    }
    /**
     * Get the cl::CommandQueue associated with this object.
     */
    cl::CommandQueue& getQueue() {
-        return *queue;
+        return queue;
    }
    /**
     * Get the array which contains the position and charge of each atom.
@@ -73,18 +75,51 @@ public:
    OpenCLArray<cl_float4>& getForce() {
        return *force;
    }
+    /**
+     * Get the array which contains the buffers in which forces are computed.
+     */
+    OpenCLArray<cl_float4>& getForceBuffers() {
+        return *forceBuffers;
+    }
    /**
     * Get the array which contains the index of each atom.
     */
    OpenCLArray<cl_int>& getAtomIndex() {
        return *atomIndex;
    }
+    /**
+     * Load OpenCL source code from a file in the kernels directory.
+     */
+    std::string loadSourceFromFile(const std::string& filename) const;
+    /**
+     * Create an OpenCL Program from source code.
+     */
+    cl::Program createProgram(const std::string source);
+    /**
+     * Set all elements of an array to 0.
+     */
+    void clearBuffer(OpenCLArray<float>& array);
+    /**
+     * Set all elements of an array to 0.
+     */
+    void clearBuffer(OpenCLArray<cl_float4>& array);
+    int numAtoms;
+    int paddedNumAtoms;
+    int numAtomBlocks;
+    int numTiles;
+    int numThreadBlocks;
+    int numForceBuffers;
+    bool forceBufferPerWarp;
 private:
-    cl::Context* context;
+    cl::Context context;
-    cl::CommandQueue* queue;
+    cl::Device device;
+    cl::CommandQueue queue;
+    cl::Program utilities;
+    cl::Kernel clearBufferKernel;
    OpenCLArray<cl_float4>* posq;
    OpenCLArray<cl_float4>* velm;
    OpenCLArray<cl_float4>* force;
+    OpenCLArray<cl_float4>* forceBuffers;
    OpenCLArray<cl_int>* atomIndex;
 };

--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -33,10 +33,10 @@ using namespace OpenMM;
 KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    OpenCLPlatform::PlatformData& data = *static_cast<OpenCLPlatform::PlatformData*>(context.getPlatformData());
-//    if (name == InitializeForcesKernel::Name())
+    if (name == InitializeForcesKernel::Name())
-//        return new OpenCLInitializeForcesKernel(name, platform);
+        return new OpenCLInitializeForcesKernel(name, platform, data);
-//    if (name == UpdateTimeKernel::Name())
+    if (name == UpdateTimeKernel::Name())
-//        return new OpenCLUpdateTimeKernel(name, platform, data);
+        return new OpenCLUpdateTimeKernel(name, platform, data);
 //    if (name == CalcHarmonicBondForceKernel::Name())
 //        return new OpenCLCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
 //    if (name == CalcHarmonicAngleForceKernel::Name())
@@ -51,8 +51,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
 //        return new OpenCLCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())
 //        return new OpenCLCalcGBSAOBCForceKernel(name, platform, data);
-//    if (name == IntegrateVerletStepKernel::Name())
+    if (name == IntegrateVerletStepKernel::Name())
-//        return new OpenCLIntegrateVerletStepKernel(name, platform, data);
+        return new OpenCLIntegrateVerletStepKernel(name, platform, data);
 //    if (name == IntegrateLangevinStepKernel::Name())
 //        return new OpenCLIntegrateLangevinStepKernel(name, platform, data);
 //    if (name == IntegrateBrownianStepKernel::Name())
@@ -63,8 +63,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
 //        return new OpenCLIntegrateVariableLangevinStepKernel(name, platform, data);
 //    if (name == ApplyAndersenThermostatKernel::Name())
 //        return new OpenCLApplyAndersenThermostatKernel(name, platform, data);
-//    if (name == CalcKineticEnergyKernel::Name())
+    if (name == CalcKineticEnergyKernel::Name())
-//        return new OpenCLCalcKineticEnergyKernel(name, platform);
+        return new OpenCLCalcKineticEnergyKernel(name, platform);
 //    if (name == RemoveCMMotionKernel::Name())
 //        return new OpenCLRemoveCMMotionKernel(name, platform, data);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "OpenCLKernels.h"
+#include "OpenCLStreamImpl.h"
+#include "openmm/LangevinIntegrator.h"
+#include "openmm/Context.h"
+#include "openmm/internal/ContextImpl.h"
+#include <cmath>
+using namespace OpenMM;
+using namespace std;
+void OpenCLInitializeForcesKernel::initialize(const System& system) {
+}
+void OpenCLInitializeForcesKernel::execute(ContextImpl& context) {
+    data.context->clearBuffer(data.context->getForceBuffers());
+}
+void OpenCLUpdateTimeKernel::initialize(const System& system) {
+}
+double OpenCLUpdateTimeKernel::getTime(const ContextImpl& context) const {
+    return data.time;
+}
+void OpenCLUpdateTimeKernel::setTime(ContextImpl& context, double time) {
+    data.time = time;
+}
+//OpenCLCalcHarmonicBondForceKernel::~OpenCLCalcHarmonicBondForceKernel() {
+//}
+//
+//void OpenCLCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
+//    if (data.primaryKernel == NULL)
+//        data.primaryKernel = this;
+//    data.hasBonds = true;
+//    numBonds = force.getNumBonds();
+//    vector<int> particle1(numBonds);
+//    vector<int> particle2(numBonds);
+//    vector<float> length(numBonds);
+//    vector<float> k(numBonds);
+//    for (int i = 0; i < numBonds; i++) {
+//        double lengthValue, kValue;
+//        force.getBondParameters(i, particle1[i], particle2[i], lengthValue, kValue);
+//        length[i] = (float) lengthValue;
+//        k[i] = (float) kValue;
+//    }
+//    gpuSetBondParameters(data.gpu, particle1, particle2, length, k);
+//}
+//
+//void OpenCLCalcHarmonicBondForceKernel::executeForces(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        calcForces(context, data);
+//}
+//
+//double OpenCLCalcHarmonicBondForceKernel::executeEnergy(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        return calcEnergy(context, data, system);
+//    return 0.0;
+//}
+//
+//OpenCLCalcHarmonicAngleForceKernel::~OpenCLCalcHarmonicAngleForceKernel() {
+//}
+//
+//void OpenCLCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
+//    if (data.primaryKernel == NULL)
+//        data.primaryKernel = this;
+//    data.hasAngles = true;
+//    numAngles = force.getNumAngles();
+//    const float RadiansToDegrees = (float) (180.0/3.14159265);
+//    vector<int> particle1(numAngles);
+//    vector<int> particle2(numAngles);
+//    vector<int> particle3(numAngles);
+//    vector<float> angle(numAngles);
+//    vector<float> k(numAngles);
+//    for (int i = 0; i < numAngles; i++) {
+//        double angleValue, kValue;
+//        force.getAngleParameters(i, particle1[i], particle2[i], particle3[i], angleValue, kValue);
+//        angle[i] = (float) (angleValue*RadiansToDegrees);
+//        k[i] = (float) kValue;
+//    }
+//    gpuSetBondAngleParameters(data.gpu, particle1, particle2, particle3, angle, k);
+//}
+//
+//void OpenCLCalcHarmonicAngleForceKernel::executeForces(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        calcForces(context, data);
+//}
+//
+//double OpenCLCalcHarmonicAngleForceKernel::executeEnergy(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        return calcEnergy(context, data, system);
+//    return 0.0;
+//}
+//
+//OpenCLCalcPeriodicTorsionForceKernel::~OpenCLCalcPeriodicTorsionForceKernel() {
+//}
+//
+//void OpenCLCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
+//    if (data.primaryKernel == NULL)
+//        data.primaryKernel = this;
+//    data.hasPeriodicTorsions = true;
+//    numTorsions = force.getNumTorsions();
+//    const float RadiansToDegrees = (float)(180.0/3.14159265);
+//    vector<int> particle1(numTorsions);
+//    vector<int> particle2(numTorsions);
+//    vector<int> particle3(numTorsions);
+//    vector<int> particle4(numTorsions);
+//    vector<float> k(numTorsions);
+//    vector<float> phase(numTorsions);
+//    vector<int> periodicity(numTorsions);
+//    for (int i = 0; i < numTorsions; i++) {
+//        double kValue, phaseValue;
+//        force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], periodicity[i], phaseValue, kValue);
+//        k[i] = (float) kValue;
+//        phase[i] = (float) (phaseValue*RadiansToDegrees);
+//    }
+//    gpuSetDihedralParameters(data.gpu, particle1, particle2, particle3, particle4, k, phase, periodicity);
+//}
+//
+//void OpenCLCalcPeriodicTorsionForceKernel::executeForces(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        calcForces(context, data);
+//}
+//
+//double OpenCLCalcPeriodicTorsionForceKernel::executeEnergy(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        return calcEnergy(context, data, system);
+//    return 0.0;
+//}
+//
+//OpenCLCalcRBTorsionForceKernel::~OpenCLCalcRBTorsionForceKernel() {
+//}
+//
+//void OpenCLCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
+//    if (data.primaryKernel == NULL)
+//        data.primaryKernel = this;
+//    data.hasRB = true;
+//    numTorsions = force.getNumTorsions();
+//    vector<int> particle1(numTorsions);
+//    vector<int> particle2(numTorsions);
+//    vector<int> particle3(numTorsions);
+//    vector<int> particle4(numTorsions);
+//    vector<float> c0(numTorsions);
+//    vector<float> c1(numTorsions);
+//    vector<float> c2(numTorsions);
+//    vector<float> c3(numTorsions);
+//    vector<float> c4(numTorsions);
+//    vector<float> c5(numTorsions);
+//    for (int i = 0; i < numTorsions; i++) {
+//        double c[6];
+//        force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], c[0], c[1], c[2], c[3], c[4], c[5]);
+//        c0[i] = (float) c[0];
+//        c1[i] = (float) c[1];
+//        c2[i] = (float) c[2];
+//        c3[i] = (float) c[3];
+//        c4[i] = (float) c[4];
+//        c5[i] = (float) c[5];
+//    }
+//    gpuSetRbDihedralParameters(data.gpu, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
+//}
+//
+//void OpenCLCalcRBTorsionForceKernel::executeForces(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        calcForces(context, data);
+//}
+//
+//double OpenCLCalcRBTorsionForceKernel::executeEnergy(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        return calcEnergy(context, data, system);
+//    return 0.0;
+//}
+//
+//OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
+//}
+//
+//void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+//    if (data.primaryKernel == NULL)
+//        data.primaryKernel = this;
+//    data.hasNonbonded = true;
+//    numParticles = force.getNumParticles();
+//    _gpuContext* gpu = data.gpu;
+//
+//    // Identify which exceptions are 1-4 interactions.
+//
+//    vector<pair<int, int> > exclusions;
+//    vector<int> exceptions;
+//    for (int i = 0; i < force.getNumExceptions(); i++) {
+//        int particle1, particle2;
+//        double chargeProd, sigma, epsilon;
+//        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+//        exclusions.push_back(pair<int, int>(particle1, particle2));
+//        if (chargeProd != 0.0 || epsilon != 0.0)
+//            exceptions.push_back(i);
+//    }
+//
+//    // Initialize nonbonded interactions.
+//
+//    {
+//        vector<int> particle(numParticles);
+//        vector<float> c6(numParticles);
+//        vector<float> c12(numParticles);
+//        vector<float> q(numParticles);
+//        vector<char> symbol;
+//        vector<vector<int> > exclusionList(numParticles);
+//        for (int i = 0; i < numParticles; i++) {
+//            double charge, radius, depth;
+//            force.getParticleParameters(i, charge, radius, depth);
+//            particle[i] = i;
+//            q[i] = (float) charge;
+//            c6[i] = (float) (4*depth*pow(radius, 6.0));
+//            c12[i] = (float) (4*depth*pow(radius, 12.0));
+//            exclusionList[i].push_back(i);
+//        }
+//        for (int i = 0; i < (int)exclusions.size(); i++) {
+//            exclusionList[exclusions[i].first].push_back(exclusions[i].second);
+//            exclusionList[exclusions[i].second].push_back(exclusions[i].first);
+//        }
+//        Vec3 boxVectors[3];
+//        system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+//        gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
+//        OpenCLNonbondedMethod method = NO_CUTOFF;
+//        if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
+//            gpuSetNonbondedCutoff(gpu, (float)force.getCutoffDistance(), force.getReactionFieldDielectric());
+//            method = CUTOFF;
+//        }
+//        if (force.getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
+//            method = PERIODIC;
+//        }
+//        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
+//            double ewaldErrorTol = force.getEwaldErrorTolerance();
+//            double alpha = (1.0/force.getCutoffDistance())*std::sqrt(-std::log(ewaldErrorTol));
+//            double mx = boxVectors[0][0]/force.getCutoffDistance();
+//            double my = boxVectors[1][1]/force.getCutoffDistance();
+//            double mz = boxVectors[2][2]/force.getCutoffDistance();
+//            double pi = 3.1415926535897932385;
+//            int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
+//            int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
+//            int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
+//            if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+//                if (kmaxx%2 == 0)
+//                    kmaxx++;
+//                if (kmaxy%2 == 0)
+//                    kmaxy++;
+//                if (kmaxz%2 == 0)
+//                    kmaxz++;
+//                gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
+//                method = EWALD;
+//            }
+//            else {
+//                int gridSizeX = -0.5*kmaxx*std::log(ewaldErrorTol);
+//                int gridSizeY = -0.5*kmaxy*std::log(ewaldErrorTol);
+//                int gridSizeZ = -0.5*kmaxz*std::log(ewaldErrorTol);
+////                printf("%d %d\n", gridSizeX, (int) (kmaxx*std::sqrt(-std::log(ewaldErrorTol))));
+////                gridSizeX = 0.02*mx*std::pow(-std::log(1.5*ewaldErrorTol), 3);
+////                gridSizeY = 0.02*my*std::pow(-std::log(1.5*ewaldErrorTol), 3);
+////                gridSizeZ = 0.02*mz*std::pow(-std::log(1.5*ewaldErrorTol), 3);
+////                double scale = 0.698*std::pow(ewaldErrorTol, -0.312);
+////                double scale = 0.713*std::pow(ewaldErrorTol, -0.261);
+////                printf("%f\n", scale);
+//                gridSizeX = mx*NonbondedForce::PMEscale;
+//                gridSizeY = my*NonbondedForce::PMEscale;
+//                gridSizeZ = mz*NonbondedForce::PMEscale;
+////                printf("%d %d %d\n", gridSizeX, gridSizeY, gridSizeZ);
+////                gridSizeX = mx*scale;
+////                gridSizeY = my*scale;
+////                gridSizeZ = mz*scale;
+//                gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
+//                method = PARTICLE_MESH_EWALD;
+//            }
+//        }
+//        data.nonbondedMethod = method;
+//        gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
+//
+//        // Compute the Ewald self energy.
+//
+//        data.ewaldSelfEnergy = 0.0;
+//        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
+//            double selfEnergyScale = gpu->sim.epsfac*gpu->sim.alphaEwald/std::sqrt(PI);
+//                for (int i = 0; i < numParticles; i++)
+//                    data.ewaldSelfEnergy -= selfEnergyScale*q[i]*q[i];
+//        }
+//    }
+//
+//    // Initialize 1-4 nonbonded interactions.
+//
+//    {
+//        int numExceptions = exceptions.size();
+//        vector<int> particle1(numExceptions);
+//        vector<int> particle2(numExceptions);
+//        vector<float> c6(numExceptions);
+//        vector<float> c12(numExceptions);
+//        vector<float> q1(numExceptions);
+//        vector<float> q2(numExceptions);
+//        for (int i = 0; i < numExceptions; i++) {
+//            double charge, sig, eps;
+//            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps);
+//            c6[i] = (float) (4*eps*pow(sig, 6.0));
+//            c12[i] = (float) (4*eps*pow(sig, 12.0));
+//            q1[i] = (float) charge;
+//            q2[i] = 1.0f;
+//        }
+//        gpuSetLJ14Parameters(gpu, 138.935485f, 1.0f, particle1, particle2, c6, c12, q1, q2);
+//    }
+//}
+//
+//void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        calcForces(context, data);
+//}
+//
+//double OpenCLCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
+//    if (data.primaryKernel == this)
+//        return calcEnergy(context, data, system);
+//    return 0.0;
+//}
+//
+//OpenCLCalcCustomNonbondedForceKernel::~OpenCLCalcCustomNonbondedForceKernel() {
+//}
+//
+//void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+//    data.primaryKernel = this; // This must always be the primary kernel so it can update the global parameters
+//    data.hasCustomNonbonded = true;
+//    numParticles = force.getNumParticles();
+//    _gpuContext* gpu = data.gpu;
+//
+//    // Identify which exceptions are actual interactions.
+//
+//    vector<pair<int, int> > exclusions;
+//    vector<int> exceptions;
+//    {
+//        vector<double> parameters;
+//        for (int i = 0; i < force.getNumExceptions(); i++) {
+//            int particle1, particle2;
+//            force.getExceptionParameters(i, particle1, particle2, parameters);
+//            exclusions.push_back(pair<int, int>(particle1, particle2));
+//            if (parameters.size() > 0)
+//                exceptions.push_back(i);
+//        }
+//    }
+//
+//    // Initialize nonbonded interactions.
+//
+//    vector<int> particle(numParticles);
+//    vector<vector<double> > parameters(numParticles);
+//    vector<vector<int> > exclusionList(numParticles);
+//    for (int i = 0; i < numParticles; i++) {
+//        force.getParticleParameters(i, parameters[i]);
+//        particle[i] = i;
+//        exclusionList[i].push_back(i);
+//    }
+//    for (int i = 0; i < (int)exclusions.size(); i++) {
+//        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
+//        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
+//    }
+//    Vec3 boxVectors[3];
+//    system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+//    gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
+//    OpenCLNonbondedMethod method = NO_CUTOFF;
+//    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
+//        method = CUTOFF;
+//    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic) {
+//        method = PERIODIC;
+//    }
+//    data.customNonbondedMethod = method;
+//
+//    // Initialize exceptions.
+//
+//    int numExceptions = exceptions.size();
+//    vector<int> exceptionParticle1(numExceptions);
+//    vector<int> exceptionParticle2(numExceptions);
+//    vector<vector<double> > exceptionParams(numExceptions);
+//    for (int i = 0; i < numExceptions; i++)
+//        force.getExceptionParameters(exceptions[i], exceptionParticle1[i], exceptionParticle2[i], exceptionParams[i]);
+//
+//    // Record the tabulated functions.
+//
+//    for (int i = 0; i < force.getNumFunctions(); i++) {
+//        string name;
+//        vector<double> values;
+//        double min, max;
+//        bool interpolating;
+//        force.getFunctionParameters(i, name, values, min, max, interpolating);
+//        gpuSetTabulatedFunction(gpu, i, name, values, min, max, interpolating);
+//    }
+//
+//    // Record information for the expressions.
+//
+//    vector<string> paramNames;
+//    vector<string> combiningRules;
+//    for (int i = 0; i < force.getNumParameters(); i++) {
+//        paramNames.push_back(force.getParameterName(i));
+//        combiningRules.push_back(force.getParameterCombiningRule(i));
+//    }
+//    globalParamNames.resize(force.getNumGlobalParameters());
+//    globalParamValues.resize(force.getNumGlobalParameters());
+//    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+//        globalParamNames[i] = force.getGlobalParameterName(i);
+//        globalParamValues[i] = force.getGlobalParameterDefaultValue(i);
+//    }
+//    gpuSetCustomNonbondedParameters(gpu, parameters, exclusionList, exceptionParticle1, exceptionParticle2, exceptionParams, method,
+//            (float)force.getCutoffDistance(), force.getEnergyFunction(), combiningRules, paramNames, globalParamNames);
+//    if (globalParamValues.size() > 0)
+//        SetCustomNonbondedGlobalParams(&globalParamValues[0]);
+//}
+//
+//void OpenCLCalcCustomNonbondedForceKernel::executeForces(ContextImpl& context) {
+//    if (data.primaryKernel == this) {
+//        updateGlobalParams(context);
+//        calcForces(context, data);
+//    }
+//}
+//
+//double OpenCLCalcCustomNonbondedForceKernel::executeEnergy(ContextImpl& context) {
+//    if (data.primaryKernel == this) {
+//        updateGlobalParams(context);
+//        return calcEnergy(context, data, system);
+//    }
+//    return 0.0;
+//}
+//
+//void OpenCLCalcCustomNonbondedForceKernel::updateGlobalParams(ContextImpl& context) {
+//    bool changed = false;
+//    for (int i = 0; i < globalParamNames.size(); i++) {
+//        float value = (float) context.getParameter(globalParamNames[i]);
+//        if (value != globalParamValues[i])
+//            changed = true;
+//        globalParamValues[i] = value;
+//    }
+//    if (changed)
+//        SetCustomNonbondedGlobalParams(&globalParamValues[0]);
+//}
+//
+//OpenCLCalcGBSAOBCForceKernel::~OpenCLCalcGBSAOBCForceKernel() {
+//}
+//
+//void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
+//
+//    int numParticles = system.getNumParticles();
+//    _gpuContext* gpu = data.gpu;
+//    vector<float> radius(numParticles);
+//    vector<float> scale(numParticles);
+//    vector<float> charge(numParticles);
+//    for (int i = 0; i < numParticles; i++) {
+//        double particleCharge, particleRadius, scalingFactor;
+//        force.getParticleParameters(i, particleCharge, particleRadius, scalingFactor);
+//        radius[i] = (float) particleRadius;
+//        scale[i] = (float) scalingFactor;
+//        charge[i] = (float) particleCharge;
+//    }
+//    gpuSetObcParameters(gpu, (float) force.getSoluteDielectric(), (float) force.getSolventDielectric(), radius, scale, charge);
+//}
+//
+//void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
+//}
+//
+//static void initializeIntegration(const System& system, OpenCLPlatform::PlatformData& data, const Integrator& integrator) {
+//
+//    // Initialize any terms that haven't already been handled by a Force.
+//
+//    _gpuContext* gpu = data.gpu;
+//    if (!data.hasBonds)
+//        gpuSetBondParameters(gpu, vector<int>(), vector<int>(), vector<float>(), vector<float>());
+//    if (!data.hasAngles)
+//        gpuSetBondAngleParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>());
+//    if (!data.hasPeriodicTorsions)
+//        gpuSetDihedralParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>(), vector<int>());
+//    if (!data.hasRB)
+//        gpuSetRbDihedralParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>(),
+//                vector<float>(), vector<float>(), vector<float>(), vector<float>());
+//    if (!data.hasNonbonded) {
+//        gpuSetCoulombParameters(gpu, 138.935485f, vector<int>(), vector<float>(), vector<float>(), vector<float>(), vector<char>(), vector<vector<int> >(), NO_CUTOFF);
+//        gpuSetLJ14Parameters(gpu, 138.935485f, 1.0f, vector<int>(), vector<int>(), vector<float>(), vector<float>(), vector<float>(), vector<float>());
+//    }
+//
+//    // Set masses.
+//
+//    int numParticles = system.getNumParticles();
+//    vector<float> mass(numParticles);
+//    for (int i = 0; i < numParticles; i++)
+//        mass[i] = (float) system.getParticleMass(i);
+//    gpuSetMass(gpu, mass);
+//
+//    // Set constraints.
+//
+//    int numConstraints = system.getNumConstraints();
+//    vector<int> particle1(numConstraints);
+//    vector<int> particle2(numConstraints);
+//    vector<float> distance(numConstraints);
+//    vector<float> invMass1(numConstraints);
+//    vector<float> invMass2(numConstraints);
+//    for (int i = 0; i < numConstraints; i++) {
+//        int particle1Index, particle2Index;
+//        double constraintDistance;
+//        system.getConstraintParameters(i, particle1Index, particle2Index, constraintDistance);
+//        particle1[i] = particle1Index;
+//        particle2[i] = particle2Index;
+//        distance[i] = (float) constraintDistance;
+//        invMass1[i] = 1.0f/mass[particle1Index];
+//        invMass2[i] = 1.0f/mass[particle2Index];
+//    }
+//    gpuSetConstraintParameters(gpu, particle1, particle2, distance, invMass1, invMass2, (float)integrator.getConstraintTolerance());
+//
+//    // Finish initialization.
+//
+//    gpuBuildThreadBlockWorkList(gpu);
+//    gpuBuildExclusionList(gpu);
+//    gpuBuildOutputBuffers(gpu);
+//    gpuSetConstants(gpu);
+//    kClearBornForces(gpu);
+//    kClearForces(gpu);
+//    cudaThreadSynchronize();
+//}
+//
+//double OpenCLCalcGBSAOBCForceKernel::executeEnergy(ContextImpl& context) {
+//	return 0.0;
+//}
+OpenCLIntegrateVerletStepKernel::~OpenCLIntegrateVerletStepKernel() {
+}
+void OpenCLIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
+//    initializeIntegration(system, data, integrator);
+    prevStepSize = -1.0;
+}
+void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
+//    _gpuContext* gpu = data.gpu;
+//    double stepSize = integrator.getStepSize();
+//    if (stepSize != prevStepSize) {
+//        // Initialize the GPU parameters.
+//
+//        gpuSetVerletIntegrationParameters(gpu, (float) stepSize, 0.0f);
+//        gpuSetConstants(gpu);
+//        prevStepSize = stepSize;
+//    }
+//    kVerletUpdatePart1(gpu);
+//    kApplyFirstShake(gpu);
+//    kApplyFirstSettle(gpu);
+//    kApplyFirstCCMA(gpu);
+//    if (data.removeCM)
+//        if (data.stepCount%data.cmMotionFrequency == 0)
+//            gpu->bCalculateCM = true;
+//    kVerletUpdatePart2(gpu);
+//    data.time += stepSize;
+//    data.stepCount++;
+}
+//OpenCLIntegrateLangevinStepKernel::~OpenCLIntegrateLangevinStepKernel() {
+//}
+//
+//void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
+//    initializeIntegration(system, data, integrator);
+//    _gpuContext* gpu = data.gpu;
+//    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
+//    gpuInitializeRandoms(gpu);
+//    prevStepSize = -1.0;
+//}
+//
+//void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+//    _gpuContext* gpu = data.gpu;
+//    double temperature = integrator.getTemperature();
+//    double friction = integrator.getFriction();
+//    double stepSize = integrator.getStepSize();
+//    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+//        // Initialize the GPU parameters.
+//
+//        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+//        gpuSetLangevinIntegrationParameters(gpu, (float) tau, (float) stepSize, (float) temperature, 0.0f);
+//        gpuSetConstants(gpu);
+//        kGenerateRandoms(gpu);
+//        prevTemp = temperature;
+//        prevFriction = friction;
+//        prevStepSize = stepSize;
+//    }
+//    kLangevinUpdatePart1(gpu);
+//    kApplyFirstShake(gpu);
+//    kApplyFirstSettle(gpu);
+//    kApplyFirstCCMA(gpu);
+//    if (data.removeCM)
+//        if (data.stepCount%data.cmMotionFrequency == 0)
+//            gpu->bCalculateCM = true;
+//    kLangevinUpdatePart2(gpu);
+//    kApplySecondShake(gpu);
+//    kApplySecondSettle(gpu);
+//    kApplySecondCCMA(gpu);
+//    data.time += stepSize;
+//    data.stepCount++;
+//}
+//
+//OpenCLIntegrateBrownianStepKernel::~OpenCLIntegrateBrownianStepKernel() {
+//}
+//
+//void OpenCLIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
+//    initializeIntegration(system, data, integrator);
+//    _gpuContext* gpu = data.gpu;
+//    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
+//    gpuInitializeRandoms(gpu);
+//    prevStepSize = -1.0;
+//}
+//
+//void OpenCLIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
+//    _gpuContext* gpu = data.gpu;
+//    double temperature = integrator.getTemperature();
+//    double friction = integrator.getFriction();
+//    double stepSize = integrator.getStepSize();
+//    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+//        // Initialize the GPU parameters.
+//
+//        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+//        gpuSetBrownianIntegrationParameters(gpu, (float) tau, (float) stepSize, (float) temperature);
+//        gpuSetConstants(gpu);
+//        kGenerateRandoms(gpu);
+//        prevTemp = temperature;
+//        prevFriction = friction;
+//        prevStepSize = stepSize;
+//    }
+//    kBrownianUpdatePart1(gpu);
+//    kApplyFirstShake(gpu);
+//    kApplyFirstSettle(gpu);
+//    kApplyFirstCCMA(gpu);
+//    if (data.removeCM)
+//        if (data.stepCount%data.cmMotionFrequency == 0)
+//            gpu->bCalculateCM = true;
+//    kBrownianUpdatePart2(gpu);
+//    data.time += stepSize;
+//    data.stepCount++;
+//}
+//
+//OpenCLIntegrateVariableVerletStepKernel::~OpenCLIntegrateVariableVerletStepKernel() {
+//}
+//
+//void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
+//    initializeIntegration(system, data, integrator);
+//    prevErrorTol = -1.0;
+//}
+//
+//void OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
+//    _gpuContext* gpu = data.gpu;
+//    double errorTol = integrator.getErrorTolerance();
+//    if (errorTol != prevErrorTol) {
+//        // Initialize the GPU parameters.
+//
+//        gpuSetVerletIntegrationParameters(gpu, 0.0f, (float) errorTol);
+//        gpuSetConstants(gpu);
+//        prevErrorTol = errorTol;
+//    }
+//    float maxStepSize = (float)(maxTime-data.time);
+//    kSelectVerletStepSize(gpu, maxStepSize);
+//    kVerletUpdatePart1(gpu);
+//    kApplyFirstShake(gpu);
+//    kApplyFirstSettle(gpu);
+//    kApplyFirstCCMA(gpu);
+//    if (data.removeCM)
+//        if (data.stepCount%data.cmMotionFrequency == 0)
+//            gpu->bCalculateCM = true;
+//    kVerletUpdatePart2(gpu);
+//    gpu->psStepSize->Download();
+//    data.time += (*gpu->psStepSize)[0].y;
+//    if ((*gpu->psStepSize)[0].y == maxStepSize)
+//        data.time = maxTime; // Avoid round-off error
+//    data.stepCount++;
+//}
+//
+//OpenCLIntegrateVariableLangevinStepKernel::~OpenCLIntegrateVariableLangevinStepKernel() {
+//}
+//
+//void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
+//    initializeIntegration(system, data, integrator);
+//    _gpuContext* gpu = data.gpu;
+//    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
+//    gpuInitializeRandoms(gpu);
+//    prevErrorTol = -1.0;
+//}
+//
+//void OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
+//    _gpuContext* gpu = data.gpu;
+//    double temperature = integrator.getTemperature();
+//    double friction = integrator.getFriction();
+//    double errorTol = integrator.getErrorTolerance();
+//    if (temperature != prevTemp || friction != prevFriction || errorTol != prevErrorTol) {
+//        // Initialize the GPU parameters.
+//
+//        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+//        gpuSetLangevinIntegrationParameters(gpu, (float) tau, 0.0f, (float) temperature, errorTol);
+//        gpuSetConstants(gpu);
+//        kGenerateRandoms(gpu);
+//        prevTemp = temperature;
+//        prevFriction = friction;
+//        prevErrorTol = errorTol;
+//    }
+//    float maxStepSize = (float)(maxTime-data.time);
+//    kSelectLangevinStepSize(gpu, maxStepSize);
+//    kLangevinUpdatePart1(gpu);
+//    kApplyFirstShake(gpu);
+//    kApplyFirstSettle(gpu);
+//    kApplyFirstCCMA(gpu);
+//    if (data.removeCM)
+//        if (data.stepCount%data.cmMotionFrequency == 0)
+//            gpu->bCalculateCM = true;
+//    kLangevinUpdatePart2(gpu);
+//    kApplySecondShake(gpu);
+//    kApplySecondSettle(gpu);
+//    kApplySecondCCMA(gpu);
+//    gpu->psStepSize->Download();
+//    data.time += (*gpu->psStepSize)[0].y;
+//    if ((*gpu->psStepSize)[0].y == maxStepSize)
+//        data.time = maxTime; // Avoid round-off error
+//    data.stepCount++;
+//}
+//
+//OpenCLApplyAndersenThermostatKernel::~OpenCLApplyAndersenThermostatKernel() {
+//}
+//
+//void OpenCLApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
+//    _gpuContext* gpu = data.gpu;
+//    gpu->seed = (unsigned long) thermostat.getRandomNumberSeed();
+//    gpuInitializeRandoms(gpu);
+//    prevStepSize = -1.0;
+//}
+//
+//void OpenCLApplyAndersenThermostatKernel::execute(ContextImpl& context) {
+//    _gpuContext* gpu = data.gpu;
+//    double temperature = context.getParameter(AndersenThermostat::Temperature());
+//    double frequency = context.getParameter(AndersenThermostat::CollisionFrequency());
+//    double stepSize = context.getIntegrator().getStepSize();
+//    if (temperature != prevTemp || frequency != prevFrequency || stepSize != prevStepSize) {
+//        // Initialize the GPU parameters.
+//
+//        gpuSetAndersenThermostatParameters(gpu, (float) temperature, frequency);
+//        gpuSetConstants(gpu);
+//        kGenerateRandoms(gpu);
+//        prevTemp = temperature;
+//        prevFrequency = frequency;
+//        prevStepSize = stepSize;
+//    }
+//    kCalculateAndersenThermostat(gpu);
+//}
+//
+void OpenCLCalcKineticEnergyKernel::initialize(const System& system) {
+    int numParticles = system.getNumParticles();
+    masses.resize(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        masses[i] = system.getParticleMass(i);
+}
+double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
+    // We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
+    // on the CPU.
+    const Stream& velocities = context.getVelocities();
+    double* v = new double[velocities.getSize()*3];
+    velocities.saveToArray(v);
+    double energy = 0.0;
+    for (size_t i = 0; i < masses.size(); ++i)
+        energy += masses[i]*(v[i*3]*v[i*3]+v[i*3+1]*v[i*3+1]+v[i*3+2]*v[i*3+2]);
+    delete v;
+    return 0.5*energy;
+}
+//
+//void OpenCLRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
+//    data.removeCM = true;
+//    data.cmMotionFrequency = force.getFrequency();
+//}
+//
+//void OpenCLRemoveCMMotionKernel::execute(ContextImpl& context) {
+//}
--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -33,55 +33,57 @@
 namespace OpenMM {
-///**
+/**
-// * This kernel is invoked at the start of each force evaluation to clear the forces.
+ * This kernel is invoked at the start of each force evaluation to clear the forces.
-// */
+ */
-//class OpenCLInitializeForcesKernel : public InitializeForcesKernel {
+class OpenCLInitializeForcesKernel : public InitializeForcesKernel {
-//public:
+public:
-//    OpenCLInitializeForcesKernel(std::string name, const Platform& platform) : InitializeForcesKernel(name, platform) {
+    OpenCLInitializeForcesKernel(std::string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : InitializeForcesKernel(name, platform), data(data) {
-//    }
+    }
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     *
+     *
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     */
+     */
-//    void initialize(const System& system);
+    void initialize(const System& system);
-//    /**
+    /**
-//     * Execute the kernel.
+     * Execute the kernel.
-//     *
+     *
-//     * @param context    the context in which to execute this kernel
+     * @param context    the context in which to execute this kernel
-//     */
+     */
-//    void execute(ContextImpl& context);
+    void execute(ContextImpl& context);
-//};
+private:
-//
+   OpenCLPlatform::PlatformData& data;
-///**
+};
-// * This kernel is invoked to get or set the current time.
-// */
+/**
-//class OpenCLUpdateTimeKernel : public UpdateTimeKernel {
+ * This kernel is invoked to get or set the current time.
-//public:
+ */
-//    OpenCLUpdateTimeKernel(std::string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : UpdateTimeKernel(name, platform), data(data) {
+class OpenCLUpdateTimeKernel : public UpdateTimeKernel {
-//    }
+public:
-//    /**
+    OpenCLUpdateTimeKernel(std::string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : UpdateTimeKernel(name, platform), data(data) {
-//     * Initialize the kernel.
+    }
-//     *
+    /**
-//     * @param system     the System this kernel will be applied to
+     * Initialize the kernel.
-//     */
+     *
-//    void initialize(const System& system);
+     * @param system     the System this kernel will be applied to
-//    /**
+     */
-//     * Get the current time (in picoseconds).
+    void initialize(const System& system);
-//     *
+    /**
-//     * @param context    the context in which to execute this kernel
+     * Get the current time (in picoseconds).
-//     */
+     *
-//    double getTime(const ContextImpl& context) const;
+     * @param context    the context in which to execute this kernel
-//    /**
+     */
-//     * Set the current time (in picoseconds).
+    double getTime(const ContextImpl& context) const;
-//     *
+    /**
-//     * @param context    the context in which to execute this kernel
+     * Set the current time (in picoseconds).
-//     */
+     *
-//    void setTime(ContextImpl& context, double time);
+     * @param context    the context in which to execute this kernel
-//private:
+     */
-//    OpenCLPlatform::PlatformData& data;
+    void setTime(ContextImpl& context, double time);
-//};
+private:
+    OpenCLPlatform::PlatformData& data;
+};
 //
 ///**
 // * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
@@ -321,34 +323,34 @@ namespace OpenMM {
 //private:
 //    OpenCLPlatform::PlatformData& data;
 //};
-//
-///**
+/**
-// * This kernel is invoked by VerletIntegrator to take one time step.
+ * This kernel is invoked by VerletIntegrator to take one time step.
-// */
+ */
-//class OpenCLIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
+class OpenCLIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
-//public:
+public:
-//    OpenCLIntegrateVerletStepKernel(std::string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : IntegrateVerletStepKernel(name, platform), data(data) {
+    OpenCLIntegrateVerletStepKernel(std::string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : IntegrateVerletStepKernel(name, platform), data(data) {
-//    }
+    }
-//    ~OpenCLIntegrateVerletStepKernel();
+    ~OpenCLIntegrateVerletStepKernel();
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     *
+     *
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     * @param integrator the VerletIntegrator this kernel will be used for
+     * @param integrator the VerletIntegrator this kernel will be used for
-//     */
+     */
-//    void initialize(const System& system, const VerletIntegrator& integrator);
+    void initialize(const System& system, const VerletIntegrator& integrator);
-//    /**
+    /**
-//     * Execute the kernel.
+     * Execute the kernel.
-//     *
+     *
-//     * @param context    the context in which to execute this kernel
+     * @param context    the context in which to execute this kernel
-//     * @param integrator the VerletIntegrator this kernel is being used for
+     * @param integrator the VerletIntegrator this kernel is being used for
-//     */
+     */
-//    void execute(ContextImpl& context, const VerletIntegrator& integrator);
+    void execute(ContextImpl& context, const VerletIntegrator& integrator);
-//private:
+private:
-//    OpenCLPlatform::PlatformData& data;
+    OpenCLPlatform::PlatformData& data;
-//    double prevStepSize;
+    double prevStepSize;
-//};
+};
-//
 ///**
 // * This kernel is invoked by LangevinIntegrator to take one time step.
 // */
@@ -484,30 +486,30 @@ namespace OpenMM {
 //    OpenCLPlatform::PlatformData& data;
 //    double prevTemp, prevFrequency, prevStepSize;
 //};
-//
-///**
+/**
-// * This kernel is invoked to calculate the kinetic energy of the system.
+ * This kernel is invoked to calculate the kinetic energy of the system.
-// */
+ */
-//class OpenCLCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
+class OpenCLCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
-//public:
+public:
-//    OpenCLCalcKineticEnergyKernel(std::string name, const Platform& platform) : CalcKineticEnergyKernel(name, platform) {
+    OpenCLCalcKineticEnergyKernel(std::string name, const Platform& platform) : CalcKineticEnergyKernel(name, platform) {
-//    }
+    }
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     *
+     *
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     */
+     */
-//    void initialize(const System& system);
+    void initialize(const System& system);
-//    /**
+    /**
-//     * Execute the kernel.
+     * Execute the kernel.
-//     *
+     *
-//     * @param context    the context in which to execute this kernel
+     * @param context    the context in which to execute this kernel
-//     */
+     */
-//    double execute(ContextImpl& context);
+    double execute(ContextImpl& context);
-//private:
+private:
-//    std::vector<double> masses;
+    std::vector<double> masses;
-//};
+};
-//
 ///**
 // * This kernel is invoked to remove center of mass motion from the system.
 // */

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -45,8 +45,8 @@ extern "C" void initOpenMMPlugin() {
 OpenCLPlatform::OpenCLPlatform() {
    OpenCLKernelFactory* factory = new OpenCLKernelFactory();
-//    registerKernelFactory(InitializeForcesKernel::Name(), factory);
+    registerKernelFactory(InitializeForcesKernel::Name(), factory);
-//    registerKernelFactory(UpdateTimeKernel::Name(), factory);
+    registerKernelFactory(UpdateTimeKernel::Name(), factory);
 //    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
 //    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
 //    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
@@ -54,13 +54,13 @@ OpenCLPlatform::OpenCLPlatform() {
 //    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
 //    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
 //    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
-//    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
 //    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
-//    registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
+    registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
 //    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(OpenCLPlatformIndex());
    platformProperties.push_back(OpenCLDeviceIndex());

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
+__kernel void clearBuffer(__global float* buffer, int size) {
+    int index = get_global_id(0);
+    int step = get_global_size(0);
+    __global float4* buffer4 = (__global float4*) buffer;
+    int sizeDiv4 = size/4;
+    while (index < sizeDiv4) {
+        buffer4[index] = (float4) (0.0f);
+        index += step;
+    }
+    if (get_global_id(0) == 0)
+        for (int i = sizeDiv4*4; i < size; i++)
+            buffer[i] = 0.0f;
+}