Further improvements to multi-GPU scaling

8f12d8fc · peastman · 67a3b4c0 · 8f12d8fc
Commit 8f12d8fc authored Nov 06, 2014 by peastman
Show whitespace changes
Inline Side-by-side

Showing with 27 additions and 23 deletions

platforms/cuda/src/CudaParallelKernels.cpp +27 -23

No files found.
--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -98,6 +98,12 @@ public:
        // Execute the kernel, then download forces.
        
        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups);
+        if (cu.getComputeForceCount() < 200) {
+            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.
+
+            CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
+            completionTime = getTime();
+        }
        if (includeForce) {
            if (cu.getContextIndex() > 0) {
                int numAtoms = cu.getPaddedNumAtoms();
@@ -110,13 +116,8 @@ public:
                else
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
            }
-            else {
-                // In principle this should make the load balancing more accurate, but in practice it just seems to make things slower.
-                //CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
        }
    }
-        completionTime = getTime();
-    }
 private:
    ContextImpl& context;
    CudaContext& cu;
@@ -192,6 +193,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
    }
 }

+#include <cstdio>
 double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
@@ -216,6 +218,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.
        
+        if (cu.getComputeForceCount() < 200) {
            int firstIndex = 0, lastIndex = 0;
            for (int i = 0; i < (int) completionTimes.size(); i++) {
                if (completionTimes[i] < completionTimes[firstIndex])
@@ -223,7 +226,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
                if (completionTimes[i] > completionTimes[lastIndex])
                    lastIndex = i;
            }
-        double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
+            double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
            contextNonbondedFractions[firstIndex] += fractionToTransfer;
            contextNonbondedFractions[lastIndex] -= fractionToTransfer;
            double startFraction = 0.0;
@@ -235,6 +238,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
                startFraction = endFraction;
            }
 	}
+    }
    return energy;
 }