Commit 5a9897bd authored by Peter Eastman
Browse files

Minor optimizations to data transfer between GPUs

parent f631ecaf
......@@ -126,8 +126,8 @@ public:
/**
* Copy the values in a vector to the Buffer.
*
* This is a convenience wrapper: it takes the address of the vector's first
* element and hands it to upload(T*, bool).
*
* @param data      the values to copy. It must be non-empty, because &data[0] is taken.
* @param blocking  forwarded unchanged to upload(T*, bool), where it selects a blocking (CL_TRUE)
*                  or non-blocking (CL_FALSE) enqueueWriteBuffer. Defaults to true. When false,
*                  the call may return before the copy has happened, so the vector must stay
*                  alive and unmodified until the write has completed.
*/
void upload(std::vector<T>& data) {
upload(&data[0]);
void upload(std::vector<T>& data, bool blocking = true) {
upload(&data[0], blocking);
}
/**
* Copy the values in the Buffer to a vector.
......@@ -140,9 +140,9 @@ public:
/**
* Copy the values in an array to the Buffer.
*/
void upload(T* data) {
void upload(T* data, bool blocking = true) {
try {
context.getQueue().enqueueWriteBuffer(*buffer, CL_TRUE, 0, size*sizeof(T), data);
context.getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*sizeof(T), data);
}
catch (cl::Error err) {
std::stringstream str;
......@@ -166,10 +166,10 @@ public:
/**
* Copy the values in the host buffer to the OpenCL Buffer.
*
* @param blocking  forwarded unchanged to upload(std::vector<T>&, bool) together with the host
*                  buffer. Defaults to true. When false, the host buffer must not be modified
*                  until the enqueued write has completed.
* @throws OpenMMException if this array has no host buffer (local.size() == 0)
*/
void upload() {
void upload(bool blocking = true) {
if (local.size() == 0)
throw OpenMMException(name+": Called upload() on an OpenCLArray with no host buffer");
upload(local);
upload(local, blocking);
}
/**
* Copy the values in the Buffer to the host buffer.
......
......@@ -199,7 +199,7 @@ void OpenCLContext::initialize(const System& system) {
for (int i = 0; i < numAtoms; i++)
(*velm)[i].w = (float) (1.0/system.getParticleMass(i));
velm->upload();
numForceBuffers = 1;
numForceBuffers = platformData.contexts.size();
for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
......
......@@ -61,7 +61,7 @@ public:
// Copy coordinates over to this device and execute the kernel.
if (cl.getContextIndex() > 0)
cl.getPosq().upload(cl.getPlatformData().contexts[0]->getPosq().getHostBuffer());
cl.getPosq().upload(cl.getPlatformData().contexts[0]->getPosq().getHostBuffer(), false);
kernel.beginComputation(context, includeForce, includeEnergy);
}
private:
......@@ -82,7 +82,7 @@ public:
// Execute the kernel, then download forces.
energy += kernel.finishComputation(context, includeForce, includeEnergy);
if (includeForce)
if (includeForce && cl.getContextIndex() > 0)
cl.getForce().download(&contextForces[cl.getContextIndex()*cl.getPaddedNumAtoms()]);
completionTime = getTime();
}
......@@ -142,8 +142,11 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
if (includeForce) {
// Sum the forces from all devices.
contextForces->upload();
data.contexts[0]->reduceBuffer(*contextForces, data.contexts.size());
OpenCLContext& cl = *data.contexts[0];
int numAtoms = cl.getPaddedNumAtoms();
cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*sizeof(mm_float4),
numAtoms*(data.contexts.size()-1)*sizeof(mm_float4), &(*contextForces)[numAtoms]);
cl.reduceBuffer(*contextForces, data.contexts.size());
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// finished last to the one that finished first.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment