Unverified Commit 70771a51 authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Improve latencies, handling of streams and events, multi-GPU support

Use a small kernel for copying interactionCounts to host memory

    hipMemcpy's CopyDeviceToHost operation has higher latency.

Do not set stream and event blocking/spin related flags

    Let the runtime choose the best option because overriding does not
    improve performance in most cases.

Remove NULL streams and use nonblocking streams explicitly

Make HipContext::pushAsCurrent/popAsCurrent thread-safe as they can be
called simultaneously from different threads via ContextSelector.

Allow peer access to be enabled more than once (if there are multiple
simulations one after another, like in benchmark.py).

Create peerCopyStream on a corresponding device

Use two-speed load balancing for multi GPU runs

    First 100 steps do coarse balancing, next 100 - fine tuning.
    Also ignore the slowest device (usually 0) if its fraction has
    reached 0, (i.e. no work can be transfered to other devices) and
    balance other devices.

Do not download inteactionCounts in parallel nonbonded tasks

    This is not required because updateNeighborListSize has been called
    and valid flag changed.

Initialize tilesAfterReorder properly

    It may contain a garbage value, and if it is large then
    updateNeighborListSize does not force reorder atoms after 25 steps
    in extremal cases.
parent ecc2d258
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. * * Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -39,7 +39,6 @@ ...@@ -39,7 +39,6 @@
#include <map> #include <map>
#include <stack>
#include <string> #include <string>
#include <utility> #include <utility>
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
...@@ -625,8 +624,8 @@ private: ...@@ -625,8 +624,8 @@ private:
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
std::vector<hipModule_t> loadedModules; std::vector<hipModule_t> loadedModules;
hipDevice_t device; hipDevice_t device;
std::stack<hipDevice_t> outerScopeDevices;
hipStream_t currentStream; hipStream_t currentStream;
hipStream_t defaultStream;
hipFunction_t clearBufferKernel; hipFunction_t clearBufferKernel;
hipFunction_t clearTwoBuffersKernel; hipFunction_t clearTwoBuffersKernel;
hipFunction_t clearThreeBuffersKernel; hipFunction_t clearThreeBuffersKernel;
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. * * Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -340,7 +340,7 @@ private: ...@@ -340,7 +340,7 @@ private:
HipSort* blockSorter; HipSort* blockSorter;
hipEvent_t downloadCountEvent; hipEvent_t downloadCountEvent;
unsigned int* pinnedCountBuffer; unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs; std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
...@@ -369,7 +369,7 @@ public: ...@@ -369,7 +369,7 @@ public:
hipFunction_t findBlockBoundsKernel; hipFunction_t findBlockBoundsKernel;
hipFunction_t sortBoxDataKernel; hipFunction_t sortBoxDataKernel;
hipFunction_t findInteractingBlocksKernel; hipFunction_t findInteractingBlocksKernel;
hipFunction_t findInteractionsWithinBlocksKernel; hipFunction_t copyInteractionCountsKernel;
}; };
/** /**
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. * * Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -85,7 +85,6 @@ private: ...@@ -85,7 +85,6 @@ private:
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long long> completionTimes; std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions; std::vector<double> contextNonbondedFractions;
int2* interactionCounts;
HipArray contextForces; HipArray contextForces;
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2012-2022 Stanford University and the Authors. * * Portions copyright (c) 2012-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -96,10 +96,9 @@ void HipArray::uploadSubArray(const void* data, int offset, int elements, bool b ...@@ -96,10 +96,9 @@ void HipArray::uploadSubArray(const void* data, int offset, int elements, bool b
if (offset < 0 || offset+elements > getSize()) if (offset < 0 || offset+elements > getSize())
throw OpenMMException("uploadSubArray: data exceeds range of array"); throw OpenMMException("uploadSubArray: data exceeds range of array");
hipError_t result; hipError_t result;
if (blocking) result = hipMemcpyAsync(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize, hipMemcpyHostToDevice, context->getCurrentStream());
result = hipMemcpyHtoD(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize); if (blocking && result == hipSuccess)
else result = hipStreamSynchronize(context->getCurrentStream());
result = hipMemcpyHtoDAsync(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize, context->getCurrentStream());
if (result != hipSuccess) { if (result != hipSuccess) {
std::stringstream str; std::stringstream str;
str<<"Error uploading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error uploading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
...@@ -111,10 +110,9 @@ void HipArray::download(void* data, bool blocking) const { ...@@ -111,10 +110,9 @@ void HipArray::download(void* data, bool blocking) const {
if (pointer == 0) if (pointer == 0)
throw OpenMMException("HipArray has not been initialized"); throw OpenMMException("HipArray has not been initialized");
hipError_t result; hipError_t result;
if (blocking) result = hipMemcpyAsync(data, pointer, size*elementSize, hipMemcpyDeviceToHost, context->getCurrentStream());
result = hipMemcpyDtoH(data, pointer, size*elementSize); if (blocking && result == hipSuccess)
else result = hipStreamSynchronize(context->getCurrentStream());
result = hipMemcpyDtoHAsync(data, pointer, size*elementSize, context->getCurrentStream());
if (result != hipSuccess) { if (result != hipSuccess) {
std::stringstream str; std::stringstream str;
str<<"Error downloading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error downloading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
...@@ -128,7 +126,7 @@ void HipArray::copyTo(ArrayInterface& dest) const { ...@@ -128,7 +126,7 @@ void HipArray::copyTo(ArrayInterface& dest) const {
if (dest.getSize() != size || dest.getElementSize() != elementSize) if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array"); throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
HipArray& cuDest = context->unwrap(dest); HipArray& cuDest = context->unwrap(dest);
hipError_t result = hipMemcpyDtoDAsync(cuDest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream()); hipError_t result = hipMemcpyAsync(cuDest.getDevicePointer(), pointer, size*elementSize, hipMemcpyDeviceToDevice, context->getCurrentStream());
if (result != hipSuccess) { if (result != hipSuccess) {
std::stringstream str; std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
......
...@@ -85,7 +85,7 @@ bool HipContext::hasInitializedHip = false; ...@@ -85,7 +85,7 @@ bool HipContext::hasInitializedHip = false;
HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, HipPlatform::PlatformData& platformData, HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, HipPlatform::PlatformData& platformData,
HipContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false), HipContext* originalContext) : ComputeContext(system), currentStream(0), defaultStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
useBlockingSync(useBlockingSync), supportsHardwareFloatGlobalAtomicAdd(false) { useBlockingSync(useBlockingSync), supportsHardwareFloatGlobalAtomicAdd(false) {
if (!hasInitializedHip) { if (!hasInitializedHip) {
...@@ -137,17 +137,8 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -137,17 +137,8 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex)); CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
// try setting device // try setting device
if (hipSetDevice(device) == hipSuccess) { if (hipSetDevice(device) == hipSuccess) {
// and set flags this->deviceIndex = trialDeviceIndex;
unsigned int flags = hipDeviceMapHost; break;
if (useBlockingSync)
flags += hipDeviceScheduleBlockingSync;
else
flags += hipDeviceScheduleSpin;
if (hipSetDeviceFlags(flags) == hipSuccess) {
this->deviceIndex = trialDeviceIndex;
break;
}
} }
} }
...@@ -157,12 +148,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -157,12 +148,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
else else
throw OpenMMException("No compatible HIP device is available"); throw OpenMMException("No compatible HIP device is available");
} }
CHECK_RESULT(hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
} }
else { else {
isLinkedContext = true; isLinkedContext = true;
this->deviceIndex = originalContext->deviceIndex; this->deviceIndex = originalContext->deviceIndex;
this->device = originalContext->device; this->device = originalContext->device;
defaultStream = originalContext->defaultStream;
} }
currentStream = defaultStream;
hipDeviceProp_t props; hipDeviceProp_t props;
CHECK_RESULT(hipGetDeviceProperties(&props, device)); CHECK_RESULT(hipGetDeviceProperties(&props, device));
...@@ -192,9 +186,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -192,9 +186,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
if (canAccess) { if (canAccess) {
{ {
ContextSelector selector2(*platformData.contexts[0]); ContextSelector selector2(*platformData.contexts[0]);
CHECK_RESULT(hipDeviceEnablePeerAccess(getDevice(), 0)); hipError_t result = hipDeviceEnablePeerAccess(getDevice(), 0);
if (result != hipErrorPeerAccessAlreadyEnabled) {
CHECK_RESULT(result);
}
}
hipError_t result = hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0);
if (result != hipErrorPeerAccessAlreadyEnabled) {
CHECK_RESULT(result);
} }
CHECK_RESULT(hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0));
} }
} }
numAtoms = system.getNumParticles(); numAtoms = system.getNumParticles();
...@@ -369,6 +369,8 @@ HipContext::~HipContext() { ...@@ -369,6 +369,8 @@ HipContext::~HipContext() {
delete nonbonded; delete nonbonded;
for (auto module : loadedModules) for (auto module : loadedModules)
hipModuleUnload(module); hipModuleUnload(module);
if (!isLinkedContext)
hipStreamDestroy(defaultStream);
popAsCurrent(); popAsCurrent();
contextIsValid = false; contextIsValid = false;
} }
...@@ -427,9 +429,11 @@ void HipContext::setAsCurrent() { ...@@ -427,9 +429,11 @@ void HipContext::setAsCurrent() {
hipSetDevice(device); hipSetDevice(device);
} }
thread_local std::stack<hipDevice_t> outerScopeDevices;
void HipContext::pushAsCurrent() { void HipContext::pushAsCurrent() {
if (contextIsValid) { if (contextIsValid) {
// Emulate cuCtxPushCurrent's behavior // Emulate cuCtxPushCurrent's behavior because hipCtxPushCurrent is deprecated
hipDevice_t outerScopeDevice; hipDevice_t outerScopeDevice;
hipGetDevice(&outerScopeDevice); hipGetDevice(&outerScopeDevice);
outerScopeDevices.push(outerScopeDevice); outerScopeDevices.push(outerScopeDevice);
...@@ -441,7 +445,7 @@ void HipContext::pushAsCurrent() { ...@@ -441,7 +445,7 @@ void HipContext::pushAsCurrent() {
void HipContext::popAsCurrent() { void HipContext::popAsCurrent() {
if (contextIsValid) { if (contextIsValid) {
// Emulate cuCtxPopCurrent's behavior // Emulate cuCtxPopCurrent's behavior because hipCtxPopCurrent is deprecated
hipDevice_t outerScopeDevice = outerScopeDevices.top(); hipDevice_t outerScopeDevice = outerScopeDevices.top();
outerScopeDevices.pop(); outerScopeDevices.pop();
if (outerScopeDevice != device) { if (outerScopeDevice != device) {
...@@ -666,7 +670,7 @@ void HipContext::setCurrentStream(hipStream_t stream) { ...@@ -666,7 +670,7 @@ void HipContext::setCurrentStream(hipStream_t stream) {
} }
void HipContext::restoreDefaultStream() { void HipContext::restoreDefaultStream() {
setCurrentStream(0); currentStream = defaultStream;
} }
HipArray* HipContext::createArray() { HipArray* HipContext::createArray() {
...@@ -885,8 +889,6 @@ vector<int> HipContext::getDevicePrecedence() { ...@@ -885,8 +889,6 @@ vector<int> HipContext::getDevicePrecedence() {
unsigned int HipContext::getEventFlags() { unsigned int HipContext::getEventFlags() {
unsigned int flags = hipEventDisableTiming; unsigned int flags = hipEventDisableTiming;
if (useBlockingSync)
flags += hipEventBlockingSync;
return flags; return flags;
} }
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -43,7 +43,7 @@ HipEvent::~HipEvent() { ...@@ -43,7 +43,7 @@ HipEvent::~HipEvent() {
} }
void HipEvent::enqueue() { void HipEvent::enqueue() {
hipEventRecord(event, 0); hipEventRecord(event, context.getCurrentStream());
} }
void HipEvent::wait() { void HipEvent::wait() {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2021 Stanford University and the Authors. * * Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -118,7 +118,7 @@ void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, dou ...@@ -118,7 +118,7 @@ void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, dou
ccmaForceKernel->setArg(8, i); ccmaForceKernel->setArg(8, i);
ccmaForceKernel->execute(ccmaConstraintAtoms.getSize()); ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
if ((i+1)%checkInterval == 0) if ((i+1)%checkInterval == 0)
CHECK_RESULT2(hipEventRecord(ccmaEvent, 0), "Error recording event for CCMA"); CHECK_RESULT2(hipEventRecord(ccmaEvent, dynamic_cast<HipContext&>(context).getCurrentStream()), "Error recording event for CCMA");
ccmaMultiplyKernel->setArg(5, i); ccmaMultiplyKernel->setArg(5, i);
ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize()); ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
ccmaUpdateKernel->setArg(9, i); ccmaUpdateKernel->setArg(9, i);
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. * * Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -65,7 +65,7 @@ private: ...@@ -65,7 +65,7 @@ private:
}; };
HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true) { blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
...@@ -374,6 +374,8 @@ void HipNonbondedUtilities::initialize(const System& system) { ...@@ -374,6 +374,8 @@ void HipNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer()); findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer()); findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
copyInteractionCountsArgs.push_back(&interactionCount.getDevicePointer());
copyInteractionCountsArgs.push_back(&pinnedCountBuffer);
} }
} }
...@@ -416,7 +418,7 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -416,7 +418,7 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize); context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
forceRebuildNeighborList = false; forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance; lastCutoff = kernels.cutoffDistance;
interactionCount.download(pinnedCountBuffer, false); context.executeKernelFlat(kernels.copyInteractionCountsKernel, &copyInteractionCountsArgs[0], 1, 1);
hipEventRecord(downloadCountEvent, context.getCurrentStream()); hipEventRecord(downloadCountEvent, context.getCurrentStream());
} }
...@@ -439,7 +441,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor ...@@ -439,7 +441,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
bool HipNonbondedUtilities::updateNeighborListSize() { bool HipNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff) if (!useCutoff)
return false; return false;
if (context.getStepsSinceReorder() == 0) if (context.getStepsSinceReorder() == 0 || tilesAfterReorder == 0)
tilesAfterReorder = pinnedCountBuffer[0]; tilesAfterReorder = pinnedCountBuffer[0];
else if (context.getStepsSinceReorder() > 25 && pinnedCountBuffer[0] > 1.1*tilesAfterReorder) else if (context.getStepsSinceReorder() > 25 && pinnedCountBuffer[0] > 1.1*tilesAfterReorder)
context.forceReorder(); context.forceReorder();
...@@ -551,6 +553,7 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -551,6 +553,7 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds"); kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData"); kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions"); kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
} }
groupKernels[groups] = kernels; groupKernels[groups] = kernels;
} }
......
...@@ -65,8 +65,8 @@ if (result != hipSuccess) { \ ...@@ -65,8 +65,8 @@ if (result != hipSuccess) { \
class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask { class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
public: public:
BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel, BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event, int2& interactionCount) : context(context), cu(cu), kernel(kernel), bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) { includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
} }
void execute() { void execute() {
// Copy coordinates over to this device and execute the kernel. // Copy coordinates over to this device and execute the kernel.
...@@ -78,8 +78,6 @@ public: ...@@ -78,8 +78,6 @@ public:
cu.getPosq().upload(pinnedMemory, false); cu.getPosq().upload(pinnedMemory, false);
} }
kernel.beginComputation(context, includeForce, includeEnergy, groups); kernel.beginComputation(context, includeForce, includeEnergy, groups);
if (cu.getNonbondedUtilities().getUsePeriodic())
cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
} }
private: private:
ContextImpl& context; ContextImpl& context;
...@@ -89,16 +87,15 @@ private: ...@@ -89,16 +87,15 @@ private:
int groups; int groups;
void* pinnedMemory; void* pinnedMemory;
hipEvent_t event; hipEvent_t event;
int2& interactionCount;
}; };
class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask { class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
public: public:
FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces, bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces,
bool& valid, int2& interactionCount, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) : bool& valid, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy), context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount), completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid),
stream(stream), event(event), localEvent(localEvent) { stream(stream), event(event), localEvent(localEvent) {
} }
void execute() { void execute() {
...@@ -120,7 +117,6 @@ public: ...@@ -120,7 +117,6 @@ public:
if (cu.getPlatformData().peerAccessSupported) { if (cu.getPlatformData().peerAccessSupported) {
int numBytes = numAtoms*3*sizeof(long long); int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes; int offset = (cu.getContextIndex()-1)*numBytes;
HipContext& context0 = *cu.getPlatformData().contexts[0];
CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset, CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset,
cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces"); cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces");
hipEventRecord(event, stream); hipEventRecord(event, stream);
...@@ -129,11 +125,6 @@ public: ...@@ -129,11 +125,6 @@ public:
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]); cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
} }
} }
if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
valid = false;
cu.getNonbondedUtilities().updateNeighborListSize();
}
} }
private: private:
ContextImpl& context; ContextImpl& context;
...@@ -146,7 +137,6 @@ private: ...@@ -146,7 +137,6 @@ private:
long long* pinnedMemory; long long* pinnedMemory;
HipArray& contextForces; HipArray& contextForces;
bool& valid; bool& valid;
int2& interactionCount;
hipStream_t stream; hipStream_t stream;
hipEvent_t event; hipEvent_t event;
hipEvent_t localEvent; hipEvent_t localEvent;
...@@ -154,7 +144,7 @@ private: ...@@ -154,7 +144,7 @@ private:
HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) : HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
interactionCounts(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++) for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
} }
...@@ -172,8 +162,6 @@ HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() { ...@@ -172,8 +162,6 @@ HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
hipEventDestroy(peerCopyEventLocal[i]); hipEventDestroy(peerCopyEventLocal[i]);
for (int i = 0; i < peerCopyStream.size(); i++) for (int i = 0; i < peerCopyStream.size(); i++)
hipStreamDestroy(peerCopyStream[i]); hipStreamDestroy(peerCopyStream[i]);
if (interactionCounts != NULL)
hipHostFree(interactionCounts);
} }
void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) { void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...@@ -190,16 +178,13 @@ void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -190,16 +178,13 @@ void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
peerCopyEvent.resize(numContexts); peerCopyEvent.resize(numContexts);
peerCopyEventLocal.resize(numContexts); peerCopyEventLocal.resize(numContexts);
peerCopyStream.resize(numContexts); peerCopyStream.resize(numContexts);
for (int i = 0; i < numContexts; i++) {
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
}
for (int i = 0; i < numContexts; i++) { for (int i = 0; i < numContexts; i++) {
HipContext& cuLocal = *data.contexts[i]; HipContext& cuLocal = *data.contexts[i];
ContextSelector selectorLocal(cuLocal); ContextSelector selectorLocal(cuLocal);
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event"); CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
} }
CHECK_RESULT(hipHostMalloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
} }
void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -207,8 +192,10 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context ...@@ -207,8 +192,10 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context
ContextSelector selector(cu); ContextSelector selector(cu);
if (!contextForces.isInitialized()) { if (!contextForces.isInitialized()) {
contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces"); contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory"); if (!cu.getPlatformData().peerAccessSupported) {
CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory"); CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
}
} }
// Copy coordinates over to each device and execute the kernel. // Copy coordinates over to each device and execute the kernel.
...@@ -234,7 +221,7 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context ...@@ -234,7 +221,7 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context
HipContext& cu = *data.contexts[i]; HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread(); ComputeContext::WorkThread& thread = cu.getWorkThread();
hipEvent_t waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event); hipEvent_t waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent, interactionCounts[i])); thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent));
} }
data.syncContexts(); data.syncContexts();
} }
...@@ -244,7 +231,7 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont ...@@ -244,7 +231,7 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont
HipContext& cu = *data.contexts[i]; HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread(); ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i])); pinnedForceBuffer, contextForces, valid, peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
} }
data.syncContexts(); data.syncContexts();
HipContext& cu = *data.contexts[0]; HipContext& cu = *data.contexts[0];
...@@ -270,13 +257,14 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont ...@@ -270,13 +257,14 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont
if (cu.getComputeForceCount() < 200) { if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
const double eps = 0.001;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i; firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex]) if (contextNonbondedFractions[lastIndex] < eps || completionTimes[i] > completionTimes[lastIndex])
lastIndex = i; lastIndex = i;
} }
double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]); double fractionToTransfer = min(cu.getComputeForceCount() < 100 ? 0.01 : 0.001, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer; contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer; contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0; double startFraction = 0.0;
...@@ -287,7 +275,7 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont ...@@ -287,7 +275,7 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont
data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction); data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
startFraction = endFraction; startFraction = endFraction;
} }
} }
} }
return energy; return energy;
} }
......
...@@ -518,3 +518,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti ...@@ -518,3 +518,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
oldPositions[i] = posq[i]; oldPositions[i] = posq[i];
} }
} }
extern "C" __global__ void copyInteractionCounts(const unsigned int* __restrict__ interactionCount,
unsigned int* __restrict__ pinnedInteractionCount) {
pinnedInteractionCount[0] = interactionCount[0];
pinnedInteractionCount[1] = interactionCount[1];
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment