Unverified commit 25f502d7, authored by bdenhollander, committed by GitHub
Browse files

Enable flush after enqueueReadBuffer on Windows (#3973)

* Enable flush on Windows

- Implements perm 5 from https://github.com/openmm/openmm/issues/3937#issuecomment-1413872621

Co-Authored-By: Philip Turner <philipturner.AR@gmail.com>

* Add brackets for clarification
Co-authored-by: Philip Turner <philipturner.AR@gmail.com>

* Make this optimization only apply to AMD GPUs

* Switch to perm 1

- Flush before call to computeNonbonded since it works well on Windows and Linux

* Update OpenCLNonbondedUtilities.cpp

* Perm 4 is now significantly faster on Windows

* Use isAMD

* Fix indentation

* Fix missed variable

* Remove Mac check

* Remove isAMD out of Mac code

* Consistent (lack of) brackets style

---------
Co-authored-by: Philip Turner <philipturner.AR@gmail.com>
parent 92199268
......@@ -334,7 +334,7 @@ private:
std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks, isAMD;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
unsigned int tilesAfterReorder;
......
......@@ -87,6 +87,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
// list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 100000);
std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc."));
setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
}
......@@ -374,6 +378,8 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance;
context.getQueue().enqueueReadBuffer(interactionCount.getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
if (isAMD)
context.getQueue().flush();
#if __APPLE__ && defined(__aarch64__)
// Segment the command stream to avoid stalls later.
......@@ -387,6 +393,8 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
return;
KernelSet& kernels = groupKernels[forceGroups];
if (kernels.hasForces) {
if (isAMD)
context.getQueue().flush();
cl::Kernel& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
if (*reinterpret_cast<cl_kernel*>(&kernel) == NULL)
kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment