Unverified Commit 70771a51 authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Improve latencies, handling of streams and events, multi-GPU support

Use a small kernel for copying interactionCounts to host memory

    hipMemcpy's CopyDeviceToHost operation has higher latency.

Do not set stream and event blocking/spin related flags

    Let the runtime choose the best option because overriding does not
    improve performance in most cases.

Remove NULL streams and use nonblocking streams explicitly

Make HipContext::pushAsCurrent/popAsCurrent thread-safe as they can be
called simultaneously from different threads via ContextSelector.

Allow peer access to be enabled more than once (if there are multiple
simulations one after another, like in benchmark.py).

Create peerCopyStream on a corresponding device

Use two-speed load balancing for multi GPU runs

    First 100 steps do coarse balancing, next 100 - fine tuning.
    Also ignore the slowest device (usually 0) if its fraction has
    reached 0, (i.e. no work can be transfered to other devices) and
    balance other devices.

Do not download inteactionCounts in parallel nonbonded tasks

    This is not required because updateNeighborListSize has been called
    and valid flag changed.

Initialize tilesAfterReorder properly

    It may contain a garbage value, and if it is large then
    updateNeighborListSize does not force reorder atoms after 25 steps
    in extremal cases.
parent ecc2d258
......@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -39,7 +39,6 @@
#include <map>
#include <stack>
#include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS
......@@ -625,8 +624,8 @@ private:
std::map<std::string, std::string> compilationDefines;
std::vector<hipModule_t> loadedModules;
hipDevice_t device;
std::stack<hipDevice_t> outerScopeDevices;
hipStream_t currentStream;
hipStream_t defaultStream;
hipFunction_t clearBufferKernel;
hipFunction_t clearTwoBuffersKernel;
hipFunction_t clearThreeBuffersKernel;
......
......@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -340,7 +340,7 @@ private:
HipSort* blockSorter;
hipEvent_t downloadCountEvent;
unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
......@@ -369,7 +369,7 @@ public:
hipFunction_t findBlockBoundsKernel;
hipFunction_t sortBoxDataKernel;
hipFunction_t findInteractingBlocksKernel;
hipFunction_t findInteractionsWithinBlocksKernel;
hipFunction_t copyInteractionCountsKernel;
};
/**
......
......@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -85,7 +85,6 @@ private:
std::vector<Kernel> kernels;
std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions;
int2* interactionCounts;
HipArray contextForces;
void* pinnedPositionBuffer;
long long* pinnedForceBuffer;
......
......@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -96,10 +96,9 @@ void HipArray::uploadSubArray(const void* data, int offset, int elements, bool b
if (offset < 0 || offset+elements > getSize())
throw OpenMMException("uploadSubArray: data exceeds range of array");
hipError_t result;
if (blocking)
result = hipMemcpyHtoD(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize);
else
result = hipMemcpyHtoDAsync(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize, context->getCurrentStream());
result = hipMemcpyAsync(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize, hipMemcpyHostToDevice, context->getCurrentStream());
if (blocking && result == hipSuccess)
result = hipStreamSynchronize(context->getCurrentStream());
if (result != hipSuccess) {
std::stringstream str;
str<<"Error uploading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
......@@ -111,10 +110,9 @@ void HipArray::download(void* data, bool blocking) const {
if (pointer == 0)
throw OpenMMException("HipArray has not been initialized");
hipError_t result;
if (blocking)
result = hipMemcpyDtoH(data, pointer, size*elementSize);
else
result = hipMemcpyDtoHAsync(data, pointer, size*elementSize, context->getCurrentStream());
result = hipMemcpyAsync(data, pointer, size*elementSize, hipMemcpyDeviceToHost, context->getCurrentStream());
if (blocking && result == hipSuccess)
result = hipStreamSynchronize(context->getCurrentStream());
if (result != hipSuccess) {
std::stringstream str;
str<<"Error downloading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
......@@ -128,7 +126,7 @@ void HipArray::copyTo(ArrayInterface& dest) const {
if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
HipArray& cuDest = context->unwrap(dest);
hipError_t result = hipMemcpyDtoDAsync(cuDest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
hipError_t result = hipMemcpyAsync(cuDest.getDevicePointer(), pointer, size*elementSize, hipMemcpyDeviceToDevice, context->getCurrentStream());
if (result != hipSuccess) {
std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
......
......@@ -85,7 +85,7 @@ bool HipContext::hasInitializedHip = false;
HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, HipPlatform::PlatformData& platformData,
HipContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
HipContext* originalContext) : ComputeContext(system), currentStream(0), defaultStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
useBlockingSync(useBlockingSync), supportsHardwareFloatGlobalAtomicAdd(false) {
if (!hasInitializedHip) {
......@@ -137,17 +137,8 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
// try setting device
if (hipSetDevice(device) == hipSuccess) {
// and set flags
unsigned int flags = hipDeviceMapHost;
if (useBlockingSync)
flags += hipDeviceScheduleBlockingSync;
else
flags += hipDeviceScheduleSpin;
if (hipSetDeviceFlags(flags) == hipSuccess) {
this->deviceIndex = trialDeviceIndex;
break;
}
this->deviceIndex = trialDeviceIndex;
break;
}
}
......@@ -157,12 +148,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
else
throw OpenMMException("No compatible HIP device is available");
}
CHECK_RESULT(hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
}
else {
isLinkedContext = true;
this->deviceIndex = originalContext->deviceIndex;
this->device = originalContext->device;
defaultStream = originalContext->defaultStream;
}
currentStream = defaultStream;
hipDeviceProp_t props;
CHECK_RESULT(hipGetDeviceProperties(&props, device));
......@@ -192,9 +186,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
if (canAccess) {
{
ContextSelector selector2(*platformData.contexts[0]);
CHECK_RESULT(hipDeviceEnablePeerAccess(getDevice(), 0));
hipError_t result = hipDeviceEnablePeerAccess(getDevice(), 0);
if (result != hipErrorPeerAccessAlreadyEnabled) {
CHECK_RESULT(result);
}
}
hipError_t result = hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0);
if (result != hipErrorPeerAccessAlreadyEnabled) {
CHECK_RESULT(result);
}
CHECK_RESULT(hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0));
}
}
numAtoms = system.getNumParticles();
......@@ -369,6 +369,8 @@ HipContext::~HipContext() {
delete nonbonded;
for (auto module : loadedModules)
hipModuleUnload(module);
if (!isLinkedContext)
hipStreamDestroy(defaultStream);
popAsCurrent();
contextIsValid = false;
}
......@@ -427,9 +429,11 @@ void HipContext::setAsCurrent() {
hipSetDevice(device);
}
thread_local std::stack<hipDevice_t> outerScopeDevices;
void HipContext::pushAsCurrent() {
if (contextIsValid) {
// Emulate cuCtxPushCurrent's behavior
// Emulate cuCtxPushCurrent's behavior because hipCtxPushCurrent is deprecated
hipDevice_t outerScopeDevice;
hipGetDevice(&outerScopeDevice);
outerScopeDevices.push(outerScopeDevice);
......@@ -441,7 +445,7 @@ void HipContext::pushAsCurrent() {
void HipContext::popAsCurrent() {
if (contextIsValid) {
// Emulate cuCtxPopCurrent's behavior
// Emulate cuCtxPopCurrent's behavior because hipCtxPopCurrent is deprecated
hipDevice_t outerScopeDevice = outerScopeDevices.top();
outerScopeDevices.pop();
if (outerScopeDevice != device) {
......@@ -666,7 +670,7 @@ void HipContext::setCurrentStream(hipStream_t stream) {
}
void HipContext::restoreDefaultStream() {
setCurrentStream(0);
currentStream = defaultStream;
}
HipArray* HipContext::createArray() {
......@@ -885,8 +889,6 @@ vector<int> HipContext::getDevicePrecedence() {
unsigned int HipContext::getEventFlags() {
unsigned int flags = hipEventDisableTiming;
if (useBlockingSync)
flags += hipEventBlockingSync;
return flags;
}
......
......@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -43,7 +43,7 @@ HipEvent::~HipEvent() {
}
void HipEvent::enqueue() {
hipEventRecord(event, 0);
hipEventRecord(event, context.getCurrentStream());
}
void HipEvent::wait() {
......
......@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -118,7 +118,7 @@ void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, dou
ccmaForceKernel->setArg(8, i);
ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
if ((i+1)%checkInterval == 0)
CHECK_RESULT2(hipEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
CHECK_RESULT2(hipEventRecord(ccmaEvent, dynamic_cast<HipContext&>(context).getCurrentStream()), "Error recording event for CCMA");
ccmaMultiplyKernel->setArg(5, i);
ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
ccmaUpdateKernel->setArg(9, i);
......
......@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -65,7 +65,7 @@ private:
};
HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true) {
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
......@@ -374,6 +374,8 @@ void HipNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
copyInteractionCountsArgs.push_back(&interactionCount.getDevicePointer());
copyInteractionCountsArgs.push_back(&pinnedCountBuffer);
}
}
......@@ -416,7 +418,7 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance;
interactionCount.download(pinnedCountBuffer, false);
context.executeKernelFlat(kernels.copyInteractionCountsKernel, &copyInteractionCountsArgs[0], 1, 1);
hipEventRecord(downloadCountEvent, context.getCurrentStream());
}
......@@ -439,7 +441,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
bool HipNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff)
return false;
if (context.getStepsSinceReorder() == 0)
if (context.getStepsSinceReorder() == 0 || tilesAfterReorder == 0)
tilesAfterReorder = pinnedCountBuffer[0];
else if (context.getStepsSinceReorder() > 25 && pinnedCountBuffer[0] > 1.1*tilesAfterReorder)
context.forceReorder();
......@@ -551,6 +553,7 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
}
groupKernels[groups] = kernels;
}
......
......@@ -65,8 +65,8 @@ if (result != hipSuccess) { \
class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
}
void execute() {
// Copy coordinates over to this device and execute the kernel.
......@@ -78,8 +78,6 @@ public:
cu.getPosq().upload(pinnedMemory, false);
}
kernel.beginComputation(context, includeForce, includeEnergy, groups);
if (cu.getNonbondedUtilities().getUsePeriodic())
cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
}
private:
ContextImpl& context;
......@@ -89,16 +87,15 @@ private:
int groups;
void* pinnedMemory;
hipEvent_t event;
int2& interactionCount;
};
class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces,
bool& valid, int2& interactionCount, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
bool& valid, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid),
stream(stream), event(event), localEvent(localEvent) {
}
void execute() {
......@@ -120,7 +117,6 @@ public:
if (cu.getPlatformData().peerAccessSupported) {
int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes;
HipContext& context0 = *cu.getPlatformData().contexts[0];
CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset,
cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces");
hipEventRecord(event, stream);
......@@ -129,11 +125,6 @@ public:
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
}
}
if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
valid = false;
cu.getNonbondedUtilities().updateNeighborListSize();
}
}
private:
ContextImpl& context;
......@@ -146,7 +137,6 @@ private:
long long* pinnedMemory;
HipArray& contextForces;
bool& valid;
int2& interactionCount;
hipStream_t stream;
hipEvent_t event;
hipEvent_t localEvent;
......@@ -154,7 +144,7 @@ private:
HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
interactionCounts(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
......@@ -172,8 +162,6 @@ HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
hipEventDestroy(peerCopyEventLocal[i]);
for (int i = 0; i < peerCopyStream.size(); i++)
hipStreamDestroy(peerCopyStream[i]);
if (interactionCounts != NULL)
hipHostFree(interactionCounts);
}
void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
......@@ -190,16 +178,13 @@ void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
peerCopyEvent.resize(numContexts);
peerCopyEventLocal.resize(numContexts);
peerCopyStream.resize(numContexts);
for (int i = 0; i < numContexts; i++) {
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
}
for (int i = 0; i < numContexts; i++) {
HipContext& cuLocal = *data.contexts[i];
ContextSelector selectorLocal(cuLocal);
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
}
CHECK_RESULT(hipHostMalloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
}
void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
......@@ -207,8 +192,10 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context
ContextSelector selector(cu);
if (!contextForces.isInitialized()) {
contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
if (!cu.getPlatformData().peerAccessSupported) {
CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
}
}
// Copy coordinates over to each device and execute the kernel.
......@@ -234,7 +221,7 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
hipEvent_t waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent, interactionCounts[i]));
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent));
}
data.syncContexts();
}
......@@ -244,7 +231,7 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
pinnedForceBuffer, contextForces, valid, peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
}
data.syncContexts();
HipContext& cu = *data.contexts[0];
......@@ -270,13 +257,14 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont
if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0;
const double eps = 0.001;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex])
if (contextNonbondedFractions[lastIndex] < eps || completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
}
double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
double fractionToTransfer = min(cu.getComputeForceCount() < 100 ? 0.01 : 0.001, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
......@@ -287,7 +275,7 @@ double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& cont
data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
startFraction = endFraction;
}
}
}
}
return energy;
}
......
......@@ -518,3 +518,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
oldPositions[i] = posq[i];
}
}
extern "C" __global__ void copyInteractionCounts(const unsigned int* __restrict__ interactionCount,
unsigned int* __restrict__ pinnedInteractionCount) {
pinnedInteractionCount[0] = interactionCount[0];
pinnedInteractionCount[1] = interactionCount[1];
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment