Commit 4ab3b428 authored by peastman
Browse files

Merge pull request #703 from peastman/multigpu

Further improvements to multi-GPU performance
parents 924af739 67a3b4c0
...@@ -86,6 +86,7 @@ private: ...@@ -86,6 +86,7 @@ private:
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
CUfunction sumKernel; CUfunction sumKernel;
CUevent event; CUevent event;
CUstream peerCopyStream;
}; };
/** /**
......
...@@ -71,10 +71,9 @@ public: ...@@ -71,10 +71,9 @@ public:
cu.setAsCurrent(); cu.setAsCurrent();
if (cu.getContextIndex() > 0) { if (cu.getContextIndex() > 0) {
if (!cu.getPlatformData().peerAccessSupported) { cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
cuStreamWaitEvent(cu.getCurrentStream(), event, 0); if (!cu.getPlatformData().peerAccessSupported)
cu.getPosq().upload(pinnedMemory, false); cu.getPosq().upload(pinnedMemory, false);
}
} }
kernel.beginComputation(context, includeForce, includeEnergy, groups); kernel.beginComputation(context, includeForce, includeEnergy, groups);
} }
...@@ -146,6 +145,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() ...@@ -146,6 +145,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
if (pinnedForceBuffer != NULL) if (pinnedForceBuffer != NULL)
cuMemFreeHost(pinnedForceBuffer); cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event); cuEventDestroy(event);
cuStreamDestroy(peerCopyStream);
} }
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...@@ -158,6 +158,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -158,6 +158,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size(); contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event"); CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
} }
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -177,11 +178,11 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex ...@@ -177,11 +178,11 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
} }
else { else {
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize(); int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
for (int i = 1; i < (int) data.contexts.size(); i++) { cuEventRecord(event, cu.getCurrentStream());
data.contexts[i]->setAsCurrent(); cuStreamWaitEvent(peerCopyStream, event, 0);
CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions"); for (int i = 1; i < (int) data.contexts.size(); i++)
} CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, peerCopyStream), "Error copying positions");
cu.setAsCurrent(); cuEventRecord(event, peerCopyStream);
} }
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0; data.contextEnergy[i] = 0.0;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment