Unverified commit 25f502d7, authored by bdenhollander and committed by GitHub
Browse files

Enable flush after enqueueReadBuffer on Windows (#3973)

* Enable flush on Windows

- Implements perm 5 from https://github.com/openmm/openmm/issues/3937#issuecomment-1413872621

Co-Authored-By: Philip Turner <philipturner.AR@gmail.com>

* Add brackets for clarification
Co-authored-by: Philip Turner <philipturner.AR@gmail.com>

* Make this optimization only apply to AMD GPUs

* Switch to perm 1

- Flush before call to computeNonbonded since it works well on Windows and Linux

* Update OpenCLNonbondedUtilities.cpp

* Perm 4 is now significantly faster on Windows

* Use isAMD

* Fix indentation

* Fix missed variable

* Remove Mac check

* Remove isAMD out of Mac code

* Consistent (lack of) brackets style

---------
Co-authored-by: Philip Turner <philipturner.AR@gmail.com>
parent 92199268
...@@ -334,7 +334,7 @@ private: ...@@ -334,7 +334,7 @@ private:
std::map<int, double> groupCutoff; std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks; bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks, isAMD;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks; int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags; int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
unsigned int tilesAfterReorder; unsigned int tilesAfterReorder;
......
...@@ -87,6 +87,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -87,6 +87,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
// list. We guess based on system size which will be faster. // list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 100000); useLargeBlocks = (context.getNumAtoms() > 100000);
std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc."));
setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded); setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
} }
...@@ -374,6 +378,8 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -374,6 +378,8 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList = false; forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance; lastCutoff = kernels.cutoffDistance;
context.getQueue().enqueueReadBuffer(interactionCount.getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent); context.getQueue().enqueueReadBuffer(interactionCount.getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
if (isAMD)
context.getQueue().flush();
#if __APPLE__ && defined(__aarch64__) #if __APPLE__ && defined(__aarch64__)
// Segment the command stream to avoid stalls later. // Segment the command stream to avoid stalls later.
...@@ -387,6 +393,8 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include ...@@ -387,6 +393,8 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
return; return;
KernelSet& kernels = groupKernels[forceGroups]; KernelSet& kernels = groupKernels[forceGroups];
if (kernels.hasForces) { if (kernels.hasForces) {
if (isAMD)
context.getQueue().flush();
cl::Kernel& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel); cl::Kernel& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
if (*reinterpret_cast<cl_kernel*>(&kernel) == NULL) if (*reinterpret_cast<cl_kernel*>(&kernel) == NULL)
kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy); kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment