Commit ef8a3447 authored by Peter Eastman's avatar Peter Eastman
Browse files

Allow multiple buffers to be cleared by a single kernel, improving performance on small systems.

parent 94a151b1
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "../../../tests/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CudaPlatform.h"
#include "openmm/NonbondedForce.h"
#include "openmm/GBSAOBCForce.h"
#include "openmm/System.h"
#include "openmm/LangevinIntegrator.h"
#include <sys/time.h>
#include <iostream>
#include <stdlib.h>
using namespace OpenMM;
using namespace std;
void testPerformance() {
const int xsize = 20;
const int ysize = 21;
const int zsize = 21;
const int numParticles = xsize*ysize*zsize;
const double spacing = 0.3;
CudaPlatform platform;
System system;
system.setDefaultPeriodicBoxVectors(Vec3(xsize*spacing, 0, 0), Vec3(0, ysize*spacing, 0), Vec3(0, 0, zsize*spacing));
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
LangevinIntegrator integrator(1.0, 0.1, 0.001);
NonbondedForce* nonbonded = new NonbondedForce();
vector<Vec3> positions;
vector<Vec3> velocities;
double charge = 0.1;
for (int i = 0; i < xsize; ++i)
for (int j = 0; j < ysize; ++j)
for (int k = 0; k < zsize; ++k) {
nonbonded->addParticle(charge, 0.2, 0.1);
charge = -charge;
positions.push_back(Vec3(i*spacing, j*spacing, k*spacing));
velocities.push_back(Vec3(0, 0, 0));
}
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
nonbonded->setCutoffDistance(3*spacing);
system.addForce(nonbonded);
Context context(system, integrator, platform);
context.setPositions(positions);
context.setVelocities(velocities);
timeval startTime;
gettimeofday(&startTime, NULL);
integrator.step(5000);
State state = context.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
timeval endTime;
gettimeofday(&endTime, NULL);
double dt = endTime.tv_sec-startTime.tv_sec+1e-6*(endTime.tv_usec-startTime.tv_usec);
std::cout << "Elapsed time: " << dt << std::endl;
std::cout << "Final energy: " << state.getPotentialEnergy()+state.getKineticEnergy() << std::endl;
}
int main() {
try {
testPerformance();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
......@@ -45,7 +45,8 @@
using namespace OpenMM;
using namespace std;
OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), stepCount(0), computeForceCount(0) {
OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), stepCount(0), computeForceCount(0), posq(NULL), velm(NULL),
forceBuffers(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), nonbonded(NULL) {
try {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
......@@ -98,6 +99,9 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
utilities = createProgram(OpenCLKernelSources::utilities);
clearBufferKernel = cl::Kernel(utilities, "clearBuffer");
clearTwoBuffersKernel = cl::Kernel(utilities, "clearTwoBuffers");
clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
......@@ -164,8 +168,10 @@ void OpenCLContext::initialize(const System& system) {
for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
force = new OpenCLArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
energyBuffer = new OpenCLArray<cl_float>(*this, numThreadBlocks*ThreadBlockSize, "energyBuffer", true);
addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
atomIndex = new OpenCLArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
for (int i = 0; i < paddedNumAtoms; ++i)
(*atomIndex)[i] = i;
......@@ -257,6 +263,47 @@ void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
executeKernel(clearBufferKernel, size);
}
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
autoclearBuffers.push_back(&memory);
autoclearBufferSizes.push_back(size);
}
void OpenCLContext::clearAutoclearBuffers() {
int base = 0;
int total = autoclearBufferSizes.size();
while (total-base >= 4) {
clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base]+3));
base += 4;
}
if (total-base == 3) {
clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]));
}
else if (total-base == 2) {
clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]));
}
else if (total-base == 1) {
clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
}
}
void OpenCLContext::reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers) {
int bufferSize = array.getSize()/numBuffers;
reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
......
......@@ -234,6 +234,17 @@ public:
* @param size the number of float elements in the buffer
*/
void clearBuffer(cl::Memory& memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the Memory to clear
* @param size the number of float elements in the buffer
*/
void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Clear all buffers that have been registered with addAutoclearBuffer().
*/
void clearAutoclearBuffers();
/**
* Given a collection of buffers packed into an array, sum them and store
* the sum in the first buffer.
......@@ -374,6 +385,9 @@ private:
cl::CommandQueue queue;
cl::Program utilities;
cl::Kernel clearBufferKernel;
cl::Kernel clearTwoBuffersKernel;
cl::Kernel clearThreeBuffersKernel;
cl::Kernel clearFourBuffersKernel;
cl::Kernel reduceFloat4Kernel;
std::vector<OpenCLForceInfo*> forces;
std::vector<MoleculeGroup> moleculeGroups;
......@@ -384,6 +398,8 @@ private:
OpenCLArray<mm_float4>* forceBuffers;
OpenCLArray<cl_float>* energyBuffer;
OpenCLArray<cl_int>* atomIndex;
std::vector<cl::Memory*> autoclearBuffers;
std::vector<int> autoclearBufferSizes;
OpenCLIntegrationUtilities* integration;
OpenCLNonbondedUtilities* nonbonded;
};
......
......@@ -73,7 +73,7 @@ void OpenCLCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context
if (cl.getNonbondedUtilities().getUseCutoff() && cl.getComputeForceCount()%100 == 0)
cl.reorderAtoms();
cl.setComputeForceCount(cl.getComputeForceCount()+1);
cl.clearBuffer(cl.getForceBuffers());
cl.clearAutoclearBuffers();
cl.getNonbondedUtilities().prepareInteractions();
}
......@@ -86,7 +86,7 @@ void OpenCLCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& contex
if (cl.getNonbondedUtilities().getUseCutoff() && cl.getComputeForceCount()%100 == 0)
cl.reorderAtoms();
cl.setComputeForceCount(cl.getComputeForceCount()+1);
cl.clearBuffer(cl.getEnergyBuffer());
cl.clearAutoclearBuffers();
cl.getNonbondedUtilities().prepareInteractions();
}
......@@ -1580,6 +1580,8 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDeviceBuffer()));;
cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
cl.addAutoclearBuffer(bornSum->getDeviceBuffer(), bornSum->getSize());
cl.addAutoclearBuffer(bornForce->getDeviceBuffer(), bornForce->getSize());
}
void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
......@@ -1655,8 +1657,6 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
reduceBornForceKernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
}
cl.clearBuffer(*bornSum);
cl.clearBuffer(*bornForce);
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
......@@ -2278,6 +2278,10 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
}
cl.addForce(new OpenCLCustomGBForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
cl.addAutoclearBuffer(buffer.getMemory(), buffer.getSize()*energyDerivs->getNumObjects()/sizeof(cl_float));
}
}
void OpenCLCalcCustomGBForceKernel::executeForces(ContextImpl& context) {
......@@ -2285,6 +2289,8 @@ void OpenCLCalcCustomGBForceKernel::executeForces(ContextImpl& context) {
if (!hasInitializedKernels) {
hasInitializedKernels = true;
valueBuffers = new OpenCLArray<cl_float>(cl, cl.getPaddedNumAtoms()*cl.getNumForceBuffers(), "customGBValueBuffers");
cl.addAutoclearBuffer(valueBuffers->getDeviceBuffer(), valueBuffers->getSize());
cl.clearBuffer(*valueBuffers);
int index = 0;
pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
......@@ -2404,11 +2410,6 @@ void OpenCLCalcCustomGBForceKernel::executeForces(ContextImpl& context) {
if (changed)
globals->upload(globalParamValues);
}
cl.clearBuffer(*valueBuffers);
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
cl.clearBuffer(buffer.getMemory(), buffer.getSize()*energyDerivs->getNumObjects()/sizeof(cl_float));
}
if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
......
......@@ -15,6 +15,33 @@ __kernel void clearBuffer(__global float* buffer, int size) {
buffer[i] = 0.0f;
}
/**
* Fill two buffers with 0.
*/
__kernel void clearTwoBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
}
/**
* Fill three buffers with 0.
*/
__kernel void clearThreeBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
}
/**
* Fill four buffers with 0.
*/
__kernel void clearFourBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3, __global float* buffer4, int size4) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
clearBuffer(buffer4, size4);
}
/**
* Sum a collection of buffers into the first one.
*/
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "../../../tests/AssertionUtilities.h"
#include "openmm/Context.h"
#include "OpenCLPlatform.h"
#include "openmm/NonbondedForce.h"
#include "openmm/System.h"
#include "openmm/LangevinIntegrator.h"
#include <sys/time.h>
#include <iostream>
#include <stdlib.h>
using namespace OpenMM;
using namespace std;
void testPerformance() {
const int xsize = 20;
const int ysize = 21;
const int zsize = 21;
const int numParticles = xsize*ysize*zsize;
const double spacing = 0.3;
OpenCLPlatform platform;
System system;
system.setDefaultPeriodicBoxVectors(Vec3(xsize*spacing, 0, 0), Vec3(0, ysize*spacing, 0), Vec3(0, 0, zsize*spacing));
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
LangevinIntegrator integrator(1.0, 0.1, 0.001);
NonbondedForce* nonbonded = new NonbondedForce();
vector<Vec3> positions;
vector<Vec3> velocities;
double charge = 0.1;
for (int i = 0; i < xsize; ++i)
for (int j = 0; j < ysize; ++j)
for (int k = 0; k < zsize; ++k) {
nonbonded->addParticle(charge, 0.2, 0.1);
charge = -charge;
positions.push_back(Vec3(i*spacing, j*spacing, k*spacing));
velocities.push_back(Vec3(0, 0, 0));
}
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
nonbonded->setCutoffDistance(3*spacing);
system.addForce(nonbonded);
Context context(system, integrator, platform);
context.setPositions(positions);
context.setVelocities(velocities);
timeval startTime;
gettimeofday(&startTime, NULL);
integrator.step(50);
State state = context.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
timeval endTime;
gettimeofday(&endTime, NULL);
double dt = endTime.tv_sec-startTime.tv_sec+1e-6*(endTime.tv_usec-startTime.tv_usec);
std::cout << "Elapsed time: " << dt << std::endl;
std::cout << "Final energy: " << state.getPotentialEnergy()+state.getKineticEnergy() << std::endl;
}
int main() {
try {
testPerformance();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment