Enable flush after enqueueReadBuffer on Windows (#3973)

* Enable flush on Windows - Implements perm 5 from https://github.com/openmm/openmm/issues/3937#issuecomment-1413872621
  Co-Authored-By: Philip Turner <philipturner.AR@gmail.com>
* Add brackets for clarification
  Co-authored-by: Philip Turner <philipturner.AR@gmail.com>
* Make this optimization only apply to AMD GPUs
* Switch to perm 1 - Flush before call to computeNonbonded since it works well on Windows and Linux
* Update OpenCLNonbondedUtilities.cpp
* Perm 4 is now significantly faster on Windows
* Use isAMD
* Fix indentation
* Fix missed variable
* Remove Mac check
* Remove isAMD out of Mac code
* Consistent (lack of) brackets style
---------
Co-authored-by: Philip Turner <philipturner.AR@gmail.com>

Enable flush after enqueueReadBuffer on Windows (#3973)
* Enable flush on Windows - Implements perm 5 from https://github.com/openmm/openmm/issues/3937#issuecomment-1413872621
  Co-Authored-By: Philip Turner <philipturner.AR@gmail.com>
* Add brackets for clarification
  Co-authored-by: Philip Turner <philipturner.AR@gmail.com>
* Make this optimization only apply to AMD GPUs
* Switch to perm 1 - Flush before call to computeNonbonded since it works well on Windows and Linux
* Update OpenCLNonbondedUtilities.cpp
* Perm 4 is now significantly faster on Windows
* Use isAMD
* Fix indentation
* Fix missed variable
* Remove Mac check
* Remove isAMD out of Mac code
* Consistent (lack of) brackets style
---------
Co-authored-by: Philip Turner <philipturner.AR@gmail.com>
25f502d7 · bdenhollander · GitHub · 92199268 · 25f502d7 · 25f502d7
Unverified Commit 25f502d7 authored Oct 24, 2023 by bdenhollander Committed by GitHub Oct 24, 2023
Showing with 9 additions and 1 deletion

platforms/opencl/include/OpenCLNonbondedUtilities.h platforms/opencl/include/OpenCLNonbondedUtilities.h +1 -1

platforms/opencl/src/OpenCLNonbondedUtilities.cpp platforms/opencl/src/OpenCLNonbondedUtilities.cpp +8 -0

No files found.
--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -334,7 +334,7 @@ private:
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
-    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks;
+    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks, isAMD;
    int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
    int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
    unsigned int tilesAfterReorder;

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -87,6 +87,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
    // list.  We guess based on system size which will be faster.

    useLargeBlocks = (context.getNumAtoms() > 100000);
+
+    std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
+    isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc."));
+
    setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
 }

@@ -374,6 +378,8 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
    forceRebuildNeighborList = false;
    lastCutoff = kernels.cutoffDistance;
    context.getQueue().enqueueReadBuffer(interactionCount.getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
+    if (isAMD)
+        context.getQueue().flush();

    #if __APPLE__ && defined(__aarch64__)
    // Segment the command stream to avoid stalls later.
@@ -387,6 +393,8 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
        return;
    KernelSet& kernels = groupKernels[forceGroups];
    if (kernels.hasForces) {
+        if (isAMD)
+            context.getQueue().flush();
        cl::Kernel& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
        if (*reinterpret_cast<cl_kernel*>(&kernel) == NULL)
            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);