Minor optimizations to data transfer between GPUs

5a9897bd · Peter Eastman · f631ecaf · 5a9897bd · 5a9897bd · 5a9897bd
Commit 5a9897bd authored May 04, 2011 by Peter Eastman
3 changed files
--- a/platforms/opencl/src/OpenCLArray.h
+++ b/platforms/opencl/src/OpenCLArray.h
@@ -126,8 +126,8 @@ public:
    /**
     * Copy the values in a vector to the Buffer.
     */
-    void upload(std::vector<T>& data) {
-        upload(&data[0]);
+    void upload(std::vector<T>& data, bool blocking = true) {
+        upload(&data[0], blocking);
    }
    /**
     * Copy the values in the Buffer to a vector.
@@ -140,9 +140,9 @@ public:
    /**
     * Copy the values in an array to the Buffer.
     */
-    void upload(T* data) {
+    void upload(T* data, bool blocking = true) {
        try {
-            context.getQueue().enqueueWriteBuffer(*buffer, CL_TRUE, 0, size*sizeof(T), data);
+            context.getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*sizeof(T), data);
        }
        catch (cl::Error err) {
            std::stringstream str;
@@ -166,10 +166,10 @@ public:
    /**
     * Copy the values in the host buffer to the OpenCL Buffer.
     */
-    void upload() {
+    void upload(bool blocking = true) {
        if (local.size() == 0)
            throw OpenMMException(name+": Called upload() on an OpenCLArray with no host buffer");
-        upload(local);
+        upload(local, blocking);
    }
    /**
     * Copy the values in the Buffer to the host buffer.

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -199,7 +199,7 @@ void OpenCLContext::initialize(const System& system) {
    for (int i = 0; i < numAtoms; i++)
        (*velm)[i].w = (float) (1.0/system.getParticleMass(i));
    velm->upload();
-    numForceBuffers = 1;
+    numForceBuffers = platformData.contexts.size();
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
    forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);

--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -61,7 +61,7 @@ public:
        // Copy coordinates over to this device and execute the kernel.

        if (cl.getContextIndex() > 0)
-            cl.getPosq().upload(cl.getPlatformData().contexts[0]->getPosq().getHostBuffer());
+            cl.getPosq().upload(cl.getPlatformData().contexts[0]->getPosq().getHostBuffer(), false);
        kernel.beginComputation(context, includeForce, includeEnergy);
    }
 private:
@@ -82,7 +82,7 @@ public:
        // Execute the kernel, then download forces.
        
        energy += kernel.finishComputation(context, includeForce, includeEnergy);
-        if (includeForce)
+        if (includeForce && cl.getContextIndex() > 0)
            cl.getForce().download(&contextForces[cl.getContextIndex()*cl.getPaddedNumAtoms()]);
        completionTime = getTime();
    }
@@ -142,8 +142,11 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
    if (includeForce) {
        // Sum the forces from all devices.
        
-        contextForces->upload();
-        data.contexts[0]->reduceBuffer(*contextForces, data.contexts.size());
+        OpenCLContext& cl = *data.contexts[0];
+        int numAtoms = cl.getPaddedNumAtoms();
+        cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*sizeof(mm_float4),
+                numAtoms*(data.contexts.size()-1)*sizeof(mm_float4), &(*contextForces)[numAtoms]);
+        cl.reduceBuffer(*contextForces, data.contexts.size());
        
        // Balance work between the contexts by transferring a few nonbonded tiles from the context that
        // finished last to the one that finished first.