Unverified Commit 0c82c264 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Improve performance running on multiple GPUs (#3347)



* Use multiple streams to broadcast positions

* Use multiple streams to reduce forces

* Adds sync between default stream and peer-copy

* Minor cleanup
Co-authored-by: default avatarDavid Clark <daclark@nvidia.com>
parent c28ae903
......@@ -90,7 +90,9 @@ private:
long long* pinnedForceBuffer;
CUfunction sumKernel;
CUevent event;
CUstream peerCopyStream;
std::vector<CUevent> peerCopyEvent;
std::vector<CUevent> peerCopyEventLocal;
std::vector<CUstream> peerCopyStream;
};
/**
......
......@@ -94,9 +94,11 @@ private:
class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int2& interactionCount) :
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces,
bool& valid, int2& interactionCount, CUstream stream, CUevent event, CUevent localEvent) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount),
stream(stream), event(event), localEvent(localEvent) {
}
void execute() {
// Execute the kernel, then download forces.
......@@ -111,12 +113,15 @@ public:
}
if (includeForce) {
if (cu.getContextIndex() > 0) {
cuEventRecord(localEvent, cu.getCurrentStream());
cuStreamWaitEvent(stream, localEvent, 0);
int numAtoms = cu.getPaddedNumAtoms();
if (cu.getPlatformData().peerAccessSupported) {
int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes;
CudaContext& context0 = *cu.getPlatformData().contexts[0];
CHECK_RESULT(cuMemcpy(contextForces.getDevicePointer()+offset, cu.getForce().getDevicePointer(), numBytes), "Error copying forces");
CHECK_RESULT(cuMemcpyAsync(contextForces.getDevicePointer()+offset, cu.getForce().getDevicePointer(), numBytes, stream), "Error copying forces");
cuEventRecord(event, stream);
}
else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
......@@ -140,6 +145,9 @@ private:
CudaArray& contextForces;
bool& valid;
int2& interactionCount;
CUstream stream;
CUevent event;
CUevent localEvent;
};
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
......@@ -156,7 +164,12 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
if (pinnedForceBuffer != NULL)
cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event);
cuStreamDestroy(peerCopyStream);
for (int i = 0; i < peerCopyEvent.size(); i++)
cuEventDestroy(peerCopyEvent[i]);
for (int i = 0; i < peerCopyEventLocal.size(); i++)
cuEventDestroy(peerCopyEventLocal[i]);
for (int i = 0; i < peerCopyStream.size(); i++)
cuStreamDestroy(peerCopyStream[i]);
if (interactionCounts != NULL)
cuMemFreeHost(interactionCounts);
}
......@@ -172,7 +185,18 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
for (int i = 0; i < numContexts; i++)
contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
peerCopyEvent.resize(numContexts);
peerCopyEventLocal.resize(numContexts);
peerCopyStream.resize(numContexts);
for (int i = 0; i < numContexts; i++) {
CHECK_RESULT(cuEventCreate(&peerCopyEvent[i], 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream[i], CU_STREAM_NON_BLOCKING), "Error creating stream");
}
for (int i = 0; i < numContexts; i++) {
CudaContext& cuLocal = *data.contexts[i];
ContextSelector selectorLocal(cuLocal);
CHECK_RESULT(cuEventCreate(&peerCopyEventLocal[i], 0), "Error creating event");
}
CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
}
......@@ -194,16 +218,18 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
else {
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
cuEventRecord(event, cu.getCurrentStream());
cuStreamWaitEvent(peerCopyStream, event, 0);
for (int i = 1; i < (int) data.contexts.size(); i++)
CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, peerCopyStream), "Error copying positions");
cuEventRecord(event, peerCopyStream);
for (int i = 1; i < (int) data.contexts.size(); i++) {
cuStreamWaitEvent(peerCopyStream[i], event, 0);
CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, peerCopyStream[i]), "Error copying positions");
cuEventRecord(peerCopyEvent[i], peerCopyStream[i]);
}
}
for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
CUevent waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent, interactionCounts[i]));
}
}
......@@ -211,17 +237,21 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, contextForces, valid, interactionCounts[i]));
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
}
data.syncContexts();
CudaContext& cu = *data.contexts[0];
ContextSelector selector(cu);
if (cu.getPlatformData().peerAccessSupported)
for (int i = 1; i < data.contexts.size(); i++)
cuStreamWaitEvent(cu.getCurrentStream(), peerCopyEvent[i], 0);
double energy = 0.0;
for (int i = 0; i < (int) data.contextEnergy.size(); i++)
energy += data.contextEnergy[i];
if (includeForce && valid) {
// Sum the forces from all devices.
CudaContext& cu = *data.contexts[0];
ContextSelector selector(cu);
if (!cu.getPlatformData().peerAccessSupported)
contextForces.upload(pinnedForceBuffer, false);
int bufferSize = 3*cu.getPaddedNumAtoms();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment