Improvements to multi-GPU performance

f32c804b · peastman · 5b591ab0
Commit f32c804b authored Nov 04, 2014 by peastman
3 changed files
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -342,6 +342,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
 void CudaNonbondedUtilities::prepareInteractions() {
    if (!useCutoff)
        return;
+    if (numTiles == 0)
+        return;
    if (usePeriodic) {
        double4 box = context.getPeriodicBoxSize();
        double minAllowedSize = 1.999999*cutoff;

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -71,12 +71,7 @@ public:

        cu.setAsCurrent();
        if (cu.getContextIndex() > 0) {
-            if (cu.getPlatformData().peerAccessSupported && false) { // Why is the peer-to-peer copy slower???
-                CudaContext& context0 = *cu.getPlatformData().contexts[0];
-                int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
-                CHECK_RESULT(cuMemcpyAsync(cu.getPosq().getDevicePointer(), context0.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions");
-            }
-            else {
+            if (!cu.getPlatformData().peerAccessSupported) {
                cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
                cu.getPosq().upload(pinnedMemory, false);
            }
@@ -117,7 +112,8 @@ public:
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
            }
            else {
-                CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
+                // In principle this should make the load balancing more accurate, but in practice it just seems to make things slower.
+                //CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
            }
        }
        completionTime = getTime();
@@ -175,10 +171,18 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex

    // Copy coordinates over to each device and execute the kernel.
    
-    if (!(cu.getPlatformData().peerAccessSupported && false)) { // Why is this faster than a peer-to-peer copy???
+    if (!cu.getPlatformData().peerAccessSupported) {
        cu.getPosq().download(pinnedPositionBuffer, false);
        cuEventRecord(event, cu.getCurrentStream());
    }
+    else {
+        int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
+        for (int i = 1; i < (int) data.contexts.size(); i++) {
+            data.contexts[i]->setAsCurrent();
+            CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions");
+        }
+        cu.setAsCurrent();
+    }
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -386,6 +386,8 @@ static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int
 void OpenCLNonbondedUtilities::prepareInteractions() {
    if (!useCutoff)
        return;
+    if (numTiles == 0)
+        return;
    if (usePeriodic) {
        mm_float4 box = context.getPeriodicBoxSize();
        double minAllowedSize = 1.999999*cutoff;