Commit 7f8c5089 authored by peastman
Browse files

Merge pull request #848 from peastman/pinned

Fixed a performance regression in multi-GPU on CUDA
parents 5069c668 9d3a655b
...@@ -6,9 +6,9 @@ import sys ...@@ -6,9 +6,9 @@ import sys
from datetime import datetime from datetime import datetime
from optparse import OptionParser from optparse import OptionParser
def timeIntegration(context, steps): def timeIntegration(context, steps, initialSteps):
"""Integrate a Context for a specified number of steps, then return how many seconds it took.""" """Integrate a Context for a specified number of steps, then return how many seconds it took."""
context.getIntegrator().step(5) # Make sure everything is fully initialized context.getIntegrator().step(initialSteps) # Make sure everything is fully initialized
context.getState(getEnergy=True) context.getState(getEnergy=True)
start = datetime.now() start = datetime.now()
context.getIntegrator().step(steps) context.getIntegrator().step(steps)
...@@ -79,11 +79,14 @@ def runOneTest(testName, options): ...@@ -79,11 +79,14 @@ def runOneTest(testName, options):
system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass) system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds)) print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
properties = {} properties = {}
initialSteps = 5
if options.device is not None: if options.device is not None:
if platform.getName() == 'CUDA': if platform.getName() == 'CUDA':
properties['CudaDeviceIndex'] = options.device properties['CudaDeviceIndex'] = options.device
elif platform.getName() == 'OpenCL': elif platform.getName() == 'OpenCL':
properties['OpenCLDeviceIndex'] = options.device properties['OpenCLDeviceIndex'] = options.device
if ',' in options.device or ' ' in options.device:
initialSteps = 250
if options.precision is not None: if options.precision is not None:
if platform.getName() == 'CUDA': if platform.getName() == 'CUDA':
properties['CudaPrecision'] = options.precision properties['CudaPrecision'] = options.precision
...@@ -102,7 +105,7 @@ def runOneTest(testName, options): ...@@ -102,7 +105,7 @@ def runOneTest(testName, options):
context.setVelocitiesToTemperature(300*unit.kelvin) context.setVelocitiesToTemperature(300*unit.kelvin)
steps = 20 steps = 20
while True: while True:
time = timeIntegration(context, steps) time = timeIntegration(context, steps, initialSteps)
if time >= 0.5*options.seconds: if time >= 0.5*options.seconds:
break break
if time < 0.5: if time < 0.5:
......
...@@ -83,7 +83,7 @@ private: ...@@ -83,7 +83,7 @@ private:
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long long> completionTimes; std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions; std::vector<double> contextNonbondedFractions;
std::vector<int> tileCounts; int* tileCounts;
CudaArray* contextForces; CudaArray* contextForces;
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
......
...@@ -141,7 +141,7 @@ private: ...@@ -141,7 +141,7 @@ private:
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) : CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
tileCounts(data.contexts.size()), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++) for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
} }
...@@ -156,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() ...@@ -156,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost(pinnedForceBuffer); cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event); cuEventDestroy(event);
cuStreamDestroy(peerCopyStream); cuStreamDestroy(peerCopyStream);
if (tileCounts != NULL)
cuMemFreeHost(tileCounts);
} }
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...@@ -163,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -163,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
cu.setAsCurrent(); cu.setAsCurrent();
CUmodule module = cu.createModule(CudaKernelSources::parallel); CUmodule module = cu.createModule(CudaKernelSources::parallel);
sumKernel = cu.getKernel(module, "sumForces"); sumKernel = cu.getKernel(module, "sumForces");
for (int i = 0; i < (int) kernels.size(); i++) int numContexts = data.contexts.size();
for (int i = 0; i < numContexts; i++)
getKernel(i).initialize(system); getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) for (int i = 0; i < numContexts; i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size(); contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event"); CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream"); CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
} }
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment