Fixed a performance regression in multi-GPU on CUDA

9d3a655b · peastman · 5069c668 · 9d3a655b · 9d3a655b · 9d3a655b
Commit 9d3a655b authored Mar 10, 2015 by peastman
3 changed files
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -6,9 +6,9 @@ import sys
 from datetime import datetime
 from optparse import OptionParser

-def timeIntegration(context, steps):
+def timeIntegration(context, steps, initialSteps):
    """Integrate a Context for a specified number of steps, then return how many seconds it took."""
-    context.getIntegrator().step(5) # Make sure everything is fully initialized
+    context.getIntegrator().step(initialSteps) # Make sure everything is fully initialized
    context.getState(getEnergy=True)
    start = datetime.now()
    context.getIntegrator().step(steps)
@@ -79,11 +79,14 @@ def runOneTest(testName, options):
        system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
    print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
    properties = {}
+    initialSteps = 5
    if options.device is not None:
        if platform.getName() == 'CUDA':
            properties['CudaDeviceIndex'] = options.device
        elif platform.getName() == 'OpenCL':
            properties['OpenCLDeviceIndex'] = options.device
+        if ',' in options.device or ' ' in options.device:
+            initialSteps = 250
    if options.precision is not None:
        if platform.getName() == 'CUDA':
            properties['CudaPrecision'] = options.precision
@@ -102,7 +105,7 @@ def runOneTest(testName, options):
    context.setVelocitiesToTemperature(300*unit.kelvin)
    steps = 20
    while True:
-        time = timeIntegration(context, steps)
+        time = timeIntegration(context, steps, initialSteps)
        if time >= 0.5*options.seconds:
            break
        if time < 0.5:

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -83,7 +83,7 @@ private:
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
    std::vector<double> contextNonbondedFractions;
-    std::vector<int> tileCounts;
+    int* tileCounts;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -99,7 +99,7 @@ public:
    }
    void execute() {
        // Execute the kernel, then download forces.
-
+        
        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
        if (cu.getComputeForceCount() < 200) {
            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.
@@ -141,7 +141,7 @@ private:

 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
-        tileCounts(data.contexts.size()), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+        tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
 }
@@ -156,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
        cuMemFreeHost(pinnedForceBuffer);
    cuEventDestroy(event);
    cuStreamDestroy(peerCopyStream);
+    if (tileCounts != NULL)
+        cuMemFreeHost(tileCounts);
 }

 void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
@@ -163,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    cu.setAsCurrent();
    CUmodule module = cu.createModule(CudaKernelSources::parallel);
    sumKernel = cu.getKernel(module, "sumForces");
-    for (int i = 0; i < (int) kernels.size(); i++)
+    int numContexts = data.contexts.size();
+    for (int i = 0; i < numContexts; i++)
        getKernel(i).initialize(system);
-    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
-        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
+    for (int i = 0; i < numContexts; i++)
+        contextNonbondedFractions[i] = 1/(double) numContexts;
    CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
    CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
+    CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {