Use blocking sync when creating events (#3561)

fe21d5ee · Peter Eastman · GitHub · c36c76ca · fe21d5ee · fe21d5ee
Unverified commit fe21d5ee, authored Apr 13, 2022 by Peter Eastman; committed by GitHub on Apr 13, 2022.
7 changed files
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -541,6 +541,10 @@ public:
     * expense of reduced simulation performance.
     */
    void flushQueue();
+    /**
+     * Get the flags to pass to cuEventCreate() when creating CUevent objects: CU_EVENT_DISABLE_TIMING, plus CU_EVENT_BLOCKING_SYNC if this context was constructed with useBlockingSync set.
+     */
+    unsigned int getEventFlags();
 private:
    /**
     * Compute a sorted list of device indices in decreasing order of desirability

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -109,7 +109,8 @@ static int executeInWindows(const string &command) {
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, bool allowRuntimeCompiler, CudaPlatform::PlatformData& platformData,
        CudaContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
-        hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
+        hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
+        useBlockingSync(useBlockingSync) {
    // Determine what compiler to use.
    
    this->compiler = "\""+compiler+"\"";
@@ -894,3 +895,10 @@ vector<int> CudaContext::getDevicePrecedence() {

    return precedence;
 }
+
+unsigned int CudaContext::getEventFlags() {  // returns the flags to pass to cuEventCreate() for this context
+    unsigned int flags = CU_EVENT_DISABLE_TIMING;  // always requested, regardless of sync mode
+    if (useBlockingSync)
+        flags |= CU_EVENT_BLOCKING_SYNC;  // bitwise OR: these are bit flags, so never combine them arithmetically
+    return flags;
+}
--- a/platforms/cuda/src/CudaEvent.cpp
+++ b/platforms/cuda/src/CudaEvent.cpp
@@ -30,7 +30,7 @@
 using namespace OpenMM;

 CudaEvent::CudaEvent(CudaContext& context) : context(context), eventCreated(false) {
-    CUresult result = cuEventCreate(&event, CU_EVENT_DISABLE_TIMING);
+    CUresult result = cuEventCreate(&event, context.getEventFlags());
    if (result != CUDA_SUCCESS)
        throw OpenMMException("Error creating CUDA event:"+CudaContext::getErrorString(result));
    eventCreated = true;

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -41,7 +41,7 @@ using namespace std;

 CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : IntegrationUtilities(context, system),
        ccmaConvergedMemory(NULL) {
-        CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
+        CHECK_RESULT2(cuEventCreate(&ccmaEvent, context.getEventFlags()), "Error creating event for CCMA"); // flags come from the context, so CU_EVENT_BLOCKING_SYNC is included when it uses blocking sync
        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
 }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -882,8 +882,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                            cufftSetStream(dispersionFftBackward, pmeStream);
                        }
                    }
-                    CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
-                    CHECK_RESULT(cuEventCreate(&paramsSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
+                    CHECK_RESULT(cuEventCreate(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
+                    CHECK_RESULT(cuEventCreate(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -70,7 +70,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
    string errorMessage = "Error initializing nonbonded utilities";
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
-    CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
+    CHECK_RESULT(cuEventCreate(&downloadCountEvent, context.getEventFlags()));
    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(unsigned int), CU_MEMHOSTALLOC_PORTABLE));
    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -184,18 +184,18 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
        getKernel(i).initialize(system);
    for (int i = 0; i < numContexts; i++)
        contextNonbondedFractions[i] = 1/(double) numContexts;
-    CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
+    CHECK_RESULT(cuEventCreate(&event, cu.getEventFlags()), "Error creating event");
    peerCopyEvent.resize(numContexts);
    peerCopyEventLocal.resize(numContexts);
    peerCopyStream.resize(numContexts);
    for (int i = 0; i < numContexts; i++) {
-        CHECK_RESULT(cuEventCreate(&peerCopyEvent[i], 0), "Error creating event");
+        CHECK_RESULT(cuEventCreate(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
        CHECK_RESULT(cuStreamCreate(&peerCopyStream[i], CU_STREAM_NON_BLOCKING), "Error creating stream");
    }
    for (int i = 0; i < numContexts; i++) {
        CudaContext& cuLocal = *data.contexts[i];
        ContextSelector selectorLocal(cuLocal);
-        CHECK_RESULT(cuEventCreate(&peerCopyEventLocal[i], 0), "Error creating event");
+        CHECK_RESULT(cuEventCreate(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
    }
    CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
 }