Allow multiple buffers to be cleared by a single kernel, improving performance on small systems.

ef8a3447 · Peter Eastman · 94a151b1 · ef8a3447 · ef8a3447 · ef8a3447
Commit ef8a3447 authored Jun 01, 2010 by Peter Eastman
6 changed files
--- a/platforms/cuda/tests/TestCudaPerformance.cpp
+++ b/platforms/cuda/tests/TestCudaPerformance.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include <sys/time.h>
+#include <iostream>
+#include <stdlib.h>
+
+using namespace OpenMM;
+using namespace std;
+
+void testPerformance() {
+    const int xsize = 20;
+    const int ysize = 21;
+    const int zsize = 21;
+    const int numParticles = xsize*ysize*zsize;
+    const double spacing = 0.3;
+    CudaPlatform platform;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(xsize*spacing, 0, 0), Vec3(0, ysize*spacing, 0), Vec3(0, 0, zsize*spacing));
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    LangevinIntegrator integrator(1.0, 0.1, 0.001);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    vector<Vec3> positions;
+    vector<Vec3> velocities;
+    double charge = 0.1;
+    for (int i = 0; i < xsize; ++i)
+        for (int j = 0; j < ysize; ++j)
+            for (int k = 0; k < zsize; ++k) {
+                nonbonded->addParticle(charge, 0.2, 0.1);
+                charge = -charge;
+                positions.push_back(Vec3(i*spacing, j*spacing, k*spacing));
+                velocities.push_back(Vec3(0, 0, 0));
+            }
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    nonbonded->setCutoffDistance(3*spacing);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    timeval startTime;
+    gettimeofday(&startTime, NULL);
+    integrator.step(5000);
+    State state = context.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    timeval endTime;
+    gettimeofday(&endTime, NULL);
+    double dt = endTime.tv_sec-startTime.tv_sec+1e-6*(endTime.tv_usec-startTime.tv_usec);
+    std::cout << "Elapsed time: " << dt << std::endl;
+    std::cout << "Final energy: " << state.getPotentialEnergy()+state.getKineticEnergy() << std::endl;
+}
+
+int main() {
+    try {
+        testPerformance();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+
+
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -45,7 +45,8 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), stepCount(0), computeForceCount(0) {
+OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), stepCount(0), computeForceCount(0), posq(NULL), velm(NULL),
+        forceBuffers(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), nonbonded(NULL) {
    try {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
@@ -98,6 +99,9 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste

    utilities = createProgram(OpenCLKernelSources::utilities);
    clearBufferKernel = cl::Kernel(utilities, "clearBuffer");
+    clearTwoBuffersKernel = cl::Kernel(utilities, "clearTwoBuffers");
+    clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers");
+    clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
    reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");

    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
@@ -164,8 +168,10 @@ void OpenCLContext::initialize(const System& system) {
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
    forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
+    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
    force = new OpenCLArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
    energyBuffer = new OpenCLArray<cl_float>(*this, numThreadBlocks*ThreadBlockSize, "energyBuffer", true);
+    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
    atomIndex = new OpenCLArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
    for (int i = 0; i < paddedNumAtoms; ++i)
        (*atomIndex)[i] = i;
@@ -257,6 +263,47 @@ void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
    executeKernel(clearBufferKernel, size);
 }

+void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
+    autoclearBuffers.push_back(&memory);
+    autoclearBufferSizes.push_back(size);
+}
+
+void OpenCLContext::clearAutoclearBuffers() {
+    int base = 0;
+    int total = autoclearBufferSizes.size();
+    while (total-base >= 4) {
+        clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+        clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+        clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+        clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+        clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+        clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+        clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+        clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base]+3));
+        base += 4;
+    }
+    if (total-base == 3) {
+        clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+        clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+        clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+        clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+        clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+        clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]));
+    }
+    else if (total-base == 2) {
+        clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+        clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+        clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+        clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]));
+    }
+    else if (total-base == 1) {
+        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
+    }
+}
+
 void OpenCLContext::reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers) {
    int bufferSize = array.getSize()/numBuffers;
    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -234,6 +234,17 @@ public:
     * @param size       the number of float elements in the buffer
     */
    void clearBuffer(cl::Memory& memory, int size);
+    /**
+     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+     *
+     * @param memory     the Memory to clear
+     * @param size       the number of float elements in the buffer
+     */
+    void addAutoclearBuffer(cl::Memory& memory, int size);
+    /**
+     * Clear all buffers that have been registered with addAutoclearBuffer().
+     */
+    void clearAutoclearBuffers();
    /**
     * Given a collection of buffers packed into an array, sum them and store
     * the sum in the first buffer.
@@ -374,6 +385,9 @@ private:
    cl::CommandQueue queue;
    cl::Program utilities;
    cl::Kernel clearBufferKernel;
+    cl::Kernel clearTwoBuffersKernel;
+    cl::Kernel clearThreeBuffersKernel;
+    cl::Kernel clearFourBuffersKernel;
    cl::Kernel reduceFloat4Kernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -384,6 +398,8 @@ private:
    OpenCLArray<mm_float4>* forceBuffers;
    OpenCLArray<cl_float>* energyBuffer;
    OpenCLArray<cl_int>* atomIndex;
+    std::vector<cl::Memory*> autoclearBuffers;
+    std::vector<int> autoclearBufferSizes;
    OpenCLIntegrationUtilities* integration;
    OpenCLNonbondedUtilities* nonbonded;
 };

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -73,7 +73,7 @@ void OpenCLCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context
    if (cl.getNonbondedUtilities().getUseCutoff() && cl.getComputeForceCount()%100 == 0)
        cl.reorderAtoms();
    cl.setComputeForceCount(cl.getComputeForceCount()+1);
-    cl.clearBuffer(cl.getForceBuffers());
+    cl.clearAutoclearBuffers();
    cl.getNonbondedUtilities().prepareInteractions();
 }

@@ -86,7 +86,7 @@ void OpenCLCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& contex
    if (cl.getNonbondedUtilities().getUseCutoff() && cl.getComputeForceCount()%100 == 0)
        cl.reorderAtoms();
    cl.setComputeForceCount(cl.getComputeForceCount()+1);
-    cl.clearBuffer(cl.getEnergyBuffer());
+    cl.clearAutoclearBuffers();
    cl.getNonbondedUtilities().prepareInteractions();
 }

@@ -1580,6 +1580,8 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
    nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
    nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDeviceBuffer()));;
    cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
+    cl.addAutoclearBuffer(bornSum->getDeviceBuffer(), bornSum->getSize());
+    cl.addAutoclearBuffer(bornForce->getDeviceBuffer(), bornForce->getSize());
 }

 void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
@@ -1655,8 +1657,6 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
        reduceBornForceKernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
        reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
    }
-    cl.clearBuffer(*bornSum);
-    cl.clearBuffer(*bornForce);
    if (nb.getUseCutoff()) {
        computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
        computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
@@ -2278,6 +2278,10 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        }
    }
    cl.addForce(new OpenCLCustomGBForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
+    for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+        const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+        cl.addAutoclearBuffer(buffer.getMemory(), buffer.getSize()*energyDerivs->getNumObjects()/sizeof(cl_float));
+    }
 }

 void OpenCLCalcCustomGBForceKernel::executeForces(ContextImpl& context) {
@@ -2285,6 +2289,8 @@ void OpenCLCalcCustomGBForceKernel::executeForces(ContextImpl& context) {
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        valueBuffers = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms()*cl.getNumForceBuffers(), "customGBValueBuffers");
+        cl.addAutoclearBuffer(valueBuffers->getDeviceBuffer(), valueBuffers->getSize());
+        cl.clearBuffer(*valueBuffers);
        int index = 0;
        pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
@@ -2404,11 +2410,6 @@ void OpenCLCalcCustomGBForceKernel::executeForces(ContextImpl& context) {
        if (changed)
            globals->upload(globalParamValues);
    }
-    cl.clearBuffer(*valueBuffers);
-    for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
-        const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
-        cl.clearBuffer(buffer.getMemory(), buffer.getSize()*energyDerivs->getNumObjects()/sizeof(cl_float));
-    }
    if (nb.getUseCutoff()) {
        pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
        pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -15,6 +15,33 @@ __kernel void clearBuffer(__global float* buffer, int size) {
            buffer[i] = 0.0f;
 }

+/**
+ * Fill two buffers with 0.
+ */
+__kernel void clearTwoBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2) {
+    clearBuffer(buffer1, size1);
+    clearBuffer(buffer2, size2);
+}
+
+/**
+ * Fill three buffers with 0.
+ */
+__kernel void clearThreeBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3) {
+    clearBuffer(buffer1, size1);
+    clearBuffer(buffer2, size2);
+    clearBuffer(buffer3, size3);
+}
+
+/**
+ * Fill four buffers with 0.
+ */
+__kernel void clearFourBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3, __global float* buffer4, int size4) {
+    clearBuffer(buffer1, size1);
+    clearBuffer(buffer2, size2);
+    clearBuffer(buffer3, size3);
+    clearBuffer(buffer4, size4);
+}
+
 /**
 * Sum a collection of buffers into the first one.
 */

--- a/platforms/opencl/tests/TestOpenCLPerformance.cpp
+++ b/platforms/opencl/tests/TestOpenCLPerformance.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "OpenCLPlatform.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include <sys/time.h>
+#include <iostream>
+#include <stdlib.h>
+
+using namespace OpenMM;
+using namespace std;
+
+void testPerformance() {
+    const int xsize = 20;
+    const int ysize = 21;
+    const int zsize = 21;
+    const int numParticles = xsize*ysize*zsize;
+    const double spacing = 0.3;
+    OpenCLPlatform platform;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(xsize*spacing, 0, 0), Vec3(0, ysize*spacing, 0), Vec3(0, 0, zsize*spacing));
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    LangevinIntegrator integrator(1.0, 0.1, 0.001);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    vector<Vec3> positions;
+    vector<Vec3> velocities;
+    double charge = 0.1;
+    for (int i = 0; i < xsize; ++i)
+        for (int j = 0; j < ysize; ++j)
+            for (int k = 0; k < zsize; ++k) {
+                nonbonded->addParticle(charge, 0.2, 0.1);
+                charge = -charge;
+                positions.push_back(Vec3(i*spacing, j*spacing, k*spacing));
+                velocities.push_back(Vec3(0, 0, 0));
+            }
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    nonbonded->setCutoffDistance(3*spacing);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    timeval startTime;
+    gettimeofday(&startTime, NULL);
+    integrator.step(50);
+    State state = context.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    timeval endTime;
+    gettimeofday(&endTime, NULL);
+    double dt = endTime.tv_sec-startTime.tv_sec+1e-6*(endTime.tv_usec-startTime.tv_usec);
+    std::cout << "Elapsed time: " << dt << std::endl;
+    std::cout << "Final energy: " << state.getPotentialEnergy()+state.getKineticEnergy() << std::endl;
+}
+
+int main() {
+    try {
+        testPerformance();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+