Merge pull request #668 from peastman/multi

Optimizations to multi-GPU calculations

Merge pull request #668 from peastman/multi
Optimizations to multi-GPU calculations
Commit 55bae85f · authored Oct 23, 2014 by peastman · other hashes listed: e19cefde, 390e0a6b
4 changed files
--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -85,6 +85,7 @@ private:
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;
    CUfunction sumKernel;
+    CUevent event;
 };

 /**

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -154,6 +154,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuCtxCreate(&context, flags, device));
    contextIsValid = true;
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
+    if (contextIndex > 0) {
+        int canAccess;
+        cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice());
+        if (canAccess) {
+            platformData.contexts[0]->setAsCurrent();
+            CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0));
+            setAsCurrent();
+            CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
+        }
+    }
    numAtoms = system.getNumParticles();
    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -63,22 +63,24 @@ if (result != CUDA_SUCCESS) { \
 class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
 public:
    BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) : context(context), cu(cu), kernel(kernel),
-            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) {
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event) : context(context), cu(cu), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.

        cu.setAsCurrent();
        if (cu.getContextIndex() > 0) {
-            if (cu.getPlatformData().peerAccessSupported && cu.getPlatformData().contexts.size() < 3) {
+            if (cu.getPlatformData().peerAccessSupported && false) { // Why is the peer-to-peer copy slower???
                CudaContext& context0 = *cu.getPlatformData().contexts[0];
                int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
-                CHECK_RESULT(cuMemcpyPeerAsync(cu.getPosq().getDevicePointer(), cu.getContext(), context0.getPosq().getDevicePointer(), context0.getContext(), numBytes, 0), "Error copying positions");
+                CHECK_RESULT(cuMemcpyAsync(cu.getPosq().getDevicePointer(), context0.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions");
            }
-            else
+            else {
+                cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
                cu.getPosq().upload(pinnedMemory, false);
            }
+        }
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
    }
 private:
@@ -88,6 +90,7 @@ private:
    bool includeForce, includeEnergy;
    int groups;
    void* pinnedMemory;
+    CUevent event;
 };

 class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
@@ -108,7 +111,7 @@ public:
                    int numBytes = numAtoms*3*sizeof(long long);
                    int offset = (cu.getContextIndex()-1)*numBytes;
                    CudaContext& context0 = *cu.getPlatformData().contexts[0];
-                    CHECK_RESULT(cuMemcpyPeer(contextForces.getDevicePointer()+offset, context0.getContext(), cu.getForce().getDevicePointer(), cu.getContext(), numBytes), "Error copying forces");
+                    CHECK_RESULT(cuMemcpy(contextForces.getDevicePointer()+offset, cu.getForce().getDevicePointer(), numBytes), "Error copying forces");
                }
                else
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
@@ -146,6 +149,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
        cuMemFreeHost(pinnedPositionBuffer);
    if (pinnedForceBuffer != NULL)
        cuMemFreeHost(pinnedForceBuffer);
+    cuEventDestroy(event);
 }

 void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
@@ -157,6 +161,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
        getKernel(i).initialize(system);
    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
+    CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -170,13 +175,15 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex

    // Copy coordinates over to each device and execute the kernel.
    
-    if (!(cu.getPlatformData().peerAccessSupported && cu.getPlatformData().contexts.size() < 3))
-        cu.getPosq().download(pinnedPositionBuffer);
+    if (!(cu.getPlatformData().peerAccessSupported && false)) { // Why is this faster than a peer-to-peer copy???
+        cu.getPosq().download(pinnedPositionBuffer, false);
+        cuEventRecord(event, cu.getCurrentStream());
+    }
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer));
+        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event));
    }
 }


--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    
    // Determine whether peer-to-peer copying is supported, and enable it if so.
    
-    peerAccessSupported = false; // Disable until I figure out why it usually makes things slower
-//    peerAccessSupported = true;
-//    for (int i = 1; i < contexts.size(); i++) {
-//        int canAccess;
-//        cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
-//        if (!canAccess) {
-//            peerAccessSupported = false;
-//            break;
-//        }
-//    }
-    if (peerAccessSupported) {
+    peerAccessSupported = true;
    for (int i = 1; i < contexts.size(); i++) {
-            contexts[0]->setAsCurrent();
-            CHECK_RESULT(cuCtxEnablePeerAccess(contexts[i]->getContext(), 0), "Error enabling peer access");
-            contexts[i]->setAsCurrent();
-            CHECK_RESULT(cuCtxEnablePeerAccess(contexts[0]->getContext(), 0), "Error enabling peer access");
+        int canAccess;
+        cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
+        if (!canAccess) {
+            peerAccessSupported = false;
+            break;
        }
    }
 }