Unverified Commit 3b8df952 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Merge pull request #4632 from ex-rzr/make-hip-standard-platform

HIP platform
parents 5ce6a85d 28fb2918
#ifndef OPENMM_HIPPARAMETERSET_H_
#define OPENMM_HIPPARAMETERSET_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipContext.h"
#include "HipNonbondedUtilities.h"
#include "openmm/common/ComputeParameterSet.h"
namespace OpenMM {
class HipNonbondedUtilities;
/**
* This class exists for backward compatibility. For most purposes you can use
* ComputeParameterSet directly instead.
*/
class OPENMM_EXPORT_COMMON HipParameterSet : public ComputeParameterSet {
public:
/**
* Create an HipParameterSet.
*
* @param context the context for which to create the parameter set
* @param numParameters the number of parameters for each object
* @param numObjects the number of objects to store parameter values for
* @param name the name of the parameter set
* @param bufferPerParameter if true, a separate buffer is created for each parameter. If false,
* multiple parameters may be combined into a single buffer.
* @param useDoublePrecision whether values should be stored as single or double precision
*/
HipParameterSet(HipContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
/**
* Get a set of HipNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
*/
std::vector<HipNonbondedUtilities::ParameterInfo>& getBuffers() {
return buffers;
}
private:
std::vector<HipNonbondedUtilities::ParameterInfo> buffers;
};
} // namespace OpenMM
#endif /*OPENMM_HIPPARAMETERSET_H_*/
#ifndef OPENMM_HIPPLATFORM_H_
#define OPENMM_HIPPLATFORM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/common/windowsExportCommon.h"
namespace OpenMM {
class HipContext;
/**
* This Platform subclass uses HIP implementations of the OpenMM kernels.
*/
class OPENMM_EXPORT_COMMON HipPlatform : public Platform {
public:
class PlatformData;
HipPlatform();
const std::string& getName() const {
static const std::string name = "HIP";
return name;
}
double getSpeed() const;
bool supportsDoublePrecision() const;
const std::string& getPropertyValue(const Context& context, const std::string& property) const;
void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
void contextDestroyed(ContextImpl& context) const;
/**
* This is the name of the parameter for selecting which HIP device or devices to use.
*/
static const std::string& HipDeviceIndex() {
static const std::string key = "DeviceIndex";
return key;
}
/**
* This is the name of the parameter that reports the HIP device or devices being used.
*/
static const std::string& HipDeviceName() {
static const std::string key = "DeviceName";
return key;
}
/**
* This is the name of the parameter for selecting whether HIP should sync or spin loop while waiting for results.
*/
static const std::string& HipUseBlockingSync() {
static const std::string key = "UseBlockingSync";
return key;
}
/**
* This is the name of the parameter for selecting what numerical precision to use.
*/
static const std::string& HipPrecision() {
static const std::string key = "Precision";
return key;
}
/**
* This is the name of the parameter for selecting whether to use the CPU based PME calculation.
*/
static const std::string& HipUseCpuPme() {
static const std::string key = "UseCpuPme";
return key;
}
/**
* This is the name of the parameter for specifying the path to the directory for creating temporary files.
*/
static const std::string& HipTempDirectory() {
static const std::string key = "TempDirectory";
return key;
}
/**
* This is the name of the parameter for selecting whether to disable use of a separate stream for PME.
*/
static const std::string& HipDisablePmeStream() {
static const std::string key = "DisablePmeStream";
return key;
}
/**
* This is the name of the parameter for requesting that force computations be fully deterministic.
*/
static const std::string& HipDeterministicForces() {
static const std::string key = "DeterministicForces";
return key;
}
};
class OPENMM_EXPORT_COMMON HipPlatform::PlatformData {
public:
PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& cpuPmeProperty, const std::string& tempProperty,
const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext);
~PlatformData();
void initializeContexts(const System& system);
void syncContexts();
ContextImpl* context;
std::vector<HipContext*> contexts;
std::vector<double> contextEnergy;
bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme, disablePmeStream, deterministicForces;
int cmMotionFrequency, computeForceCount;
long long stepCount;
double time;
std::map<std::string, std::string> propertyValues;
ThreadPool threads;
};
} // namespace OpenMM
#endif /*OPENMM_HIPPLATFORM_H_*/
#ifndef OPENMM_HIPPROGRAM_H_
#define OPENMM_HIPPROGRAM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/common/ComputeProgram.h"
#include "HipContext.h"
namespace OpenMM {
/**
* This is the HIP implementation of the ComputeProgramImpl interface.
*/
class HipProgram : public ComputeProgramImpl {
public:
/**
* Create a new HipProgram.
*
* @param context the context this kernel belongs to
* @param module the compiled module
*/
HipProgram(HipContext& context, hipModule_t module);
/**
* Create a ComputeKernel for one of the kernels in this program.
*
* @param name the name of the kernel to get
*/
ComputeKernel createKernel(const std::string& name);
private:
HipContext& context;
hipModule_t module;
};
} // namespace OpenMM
#endif /*OPENMM_HIPPROGRAM_H_*/
#ifndef __OPENMM_HIPSORT_H__
#define __OPENMM_HIPSORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipArray.h"
#include "openmm/common/windowsExportCommon.h"
#include "HipContext.h"
namespace OpenMM {
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class FloatTrait : public HipSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-3.40282e+38f";}
* const char* getMaxKey() const {return "3.40282e+38f";}
* const char* getMaxValue() const {return "3.40282e+38f";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class OPENMM_EXPORT_COMMON HipSort {
public:
class SortTrait;
/**
* Create a HipSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the HipSort is deleted.
* @param length the length of the arrays this object will be used to sort.
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint.
*/
HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
~HipSort();
/**
* Sort an array.
*/
void sort(HipArray& data);
private:
HipContext& context;
SortTrait* trait;
HipArray counters;
HipArray dataRange;
HipArray bucketOfElement;
HipArray offsetInBucket;
HipArray bucketOffset;
HipArray buckets;
hipFunction_t shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelBlocks, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList, uniform;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class HipSort::SortTrait {
public:
virtual ~SortTrait() {
}
/**
* Get the size of each data value in bytes.
*/
virtual int getDataSize() const = 0;
/**
* Get the size of each key value in bytes.
*/
virtual int getKeySize() const = 0;
/**
* Get the data type of the values to sort.
*/
virtual const char* getDataType() const = 0;
/**
* Get the data type of the sorting key.
*/
virtual const char* getKeyType() const = 0;
/**
* Get the minimum value a key can take.
*/
virtual const char* getMinKey() const = 0;
/**
* Get the maximum value a key can take.
*/
virtual const char* getMaxKey() const = 0;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual const char* getMaxValue() const = 0;
/**
* Get the HIP code to select the key from the data value.
*/
virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif // __OPENMM_HIPSORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipArray.h"
#include "HipContext.h"
#include "openmm/common/ContextSelector.h"
#include <iostream>
#include <sstream>
#include <vector>
using namespace OpenMM;
HipArray::HipArray() : pointer(0), ownsMemory(false) {
}
HipArray::HipArray(HipContext& context, size_t size, int elementSize, const std::string& name) : pointer(0) {
initialize(context, size, elementSize, name);
}
HipArray::~HipArray() {
if (pointer != 0 && ownsMemory && context->getContextIsValid()) {
ContextSelector selector(*context);
hipError_t result = hipFree(pointer);
if (result != hipSuccess) {
std::stringstream str;
str<<"Error deleting array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
}
void HipArray::initialize(ComputeContext& context, size_t size, int elementSize, const std::string& name) {
if (this->pointer != 0)
throw OpenMMException("HipArray has already been initialized");
this->context = &dynamic_cast<HipContext&>(context);
this->size = size;
this->elementSize = elementSize;
this->name = name;
ownsMemory = true;
ContextSelector selector(*this->context);
hipError_t result = hipMalloc(&pointer, size*elementSize);
if (result != hipSuccess) {
std::stringstream str;
str<<"Error creating array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
void HipArray::resize(size_t size) {
if (pointer == 0)
throw OpenMMException("HipArray has not been initialized");
if (!ownsMemory)
throw OpenMMException("Cannot resize an array that does not own its storage");
ContextSelector selector(*context);
hipError_t result = hipFree(pointer);
if (result != hipSuccess) {
std::stringstream str;
str<<"Error deleting array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
pointer = 0;
initialize(*context, size, elementSize, name);
}
ComputeContext& HipArray::getContext() {
return *context;
}
void HipArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
if (pointer == 0)
throw OpenMMException("HipArray has not been initialized");
if (offset < 0 || offset+elements > getSize())
throw OpenMMException("uploadSubArray: data exceeds range of array");
hipError_t result;
result = hipMemcpyAsync(reinterpret_cast<char*>(pointer)+offset*elementSize, const_cast<void*>(data), elements*elementSize, hipMemcpyHostToDevice, context->getCurrentStream());
if (blocking && result == hipSuccess)
result = hipStreamSynchronize(context->getCurrentStream());
if (result != hipSuccess) {
std::stringstream str;
str<<"Error uploading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
void HipArray::download(void* data, bool blocking) const {
if (pointer == 0)
throw OpenMMException("HipArray has not been initialized");
hipError_t result;
result = hipMemcpyAsync(data, pointer, size*elementSize, hipMemcpyDeviceToHost, context->getCurrentStream());
if (blocking && result == hipSuccess)
result = hipStreamSynchronize(context->getCurrentStream());
if (result != hipSuccess) {
std::stringstream str;
str<<"Error downloading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
void HipArray::copyTo(ArrayInterface& dest) const {
if (pointer == 0)
throw OpenMMException("HipArray has not been initialized");
if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
HipArray& cuDest = context->unwrap(dest);
hipError_t result = hipMemcpyAsync(cuDest.getDevicePointer(), pointer, size*elementSize, hipMemcpyDeviceToDevice, context->getCurrentStream());
if (result != hipSuccess) {
std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2024 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include "HipContext.h"
#include "HipArray.h"
#include "HipBondedUtilities.h"
#include "HipEvent.h"
#include "HipIntegrationUtilities.h"
#include "HipKernels.h"
#include "HipKernelSources.h"
#include "HipNonbondedUtilities.h"
#include "HipProgram.h"
#include "HipFFT3D.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/common/ContextSelector.h"
#include "SHA1.h"
#include "openmm/MonteCarloFlexibleBarostat.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "HipExpressionUtilities.h"
#include "openmm/internal/ContextImpl.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <set>
#include <sstream>
#include <stack>
#include <thread>
#include <typeinfo>
#include <sys/stat.h>
#include <hip/hiprtc.h>
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
#define HIPRTC_CHECK_RESULT(result, prefix) \
if (result != HIPRTC_SUCCESS) { \
stringstream m; \
m<<prefix<<": "<<hiprtcGetErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
using namespace OpenMM;
using namespace std;
const int HipContext::ThreadBlockSize = 64;
const int HipContext::TileSize = 32;
bool HipContext::hasInitializedHip = false;
HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, HipPlatform::PlatformData& platformData,
HipContext* originalContext) : ComputeContext(system), currentStream(0), defaultStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
useBlockingSync(useBlockingSync), supportsHardwareFloatGlobalAtomicAdd(false) {
if (!hasInitializedHip) {
CHECK_RESULT2(hipInit(0), "Error initializing HIP");
hasInitializedHip = true;
}
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for Precision: "+precision);
char* cacheVariable = getenv("OPENMM_CACHE_DIR");
cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
#ifdef WIN32
this->tempDir = tempDir+"\\";
cacheDir = cacheDir+"\\";
#else
this->tempDir = tempDir+"/";
cacheDir = cacheDir+"/";
#endif
contextIndex = platformData.contexts.size();
string errorMessage = "Error initializing Context";
if (originalContext == NULL) {
isLinkedContext = false;
int numDevices;
CHECK_RESULT(hipGetDeviceCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) {
devicePrecedence = getDevicePrecedence();
} else {
devicePrecedence.push_back(deviceIndex);
}
this->deviceIndex = -1;
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
// try setting device
if (hipSetDevice(device) == hipSuccess) {
this->deviceIndex = trialDeviceIndex;
break;
}
}
if (this->deviceIndex == -1) {
if (deviceIndex != -1)
throw OpenMMException("The requested HIP device could not be loaded");
else
throw OpenMMException("No compatible HIP device is available");
}
CHECK_RESULT(hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
}
else {
isLinkedContext = true;
this->deviceIndex = originalContext->deviceIndex;
this->device = originalContext->device;
defaultStream = originalContext->defaultStream;
}
currentStream = defaultStream;
hipDeviceProp_t props;
CHECK_RESULT(hipGetDeviceProperties(&props, device));
// set device properties
this->simdWidth = props.warpSize;
this->sharedMemPerBlock = props.sharedMemPerBlock;
gpuArchitecture = props.gcnArchName;
// HIP-TODO: find a good value here
int numThreadBlocksPerComputeUnit = 6;
// GPUs starting from CDNA1 and RDNA3 support atomic add for floats (global_atomic_add_f32),
// which can be used in PME. Older GPUs use fixed point charge spreading instead.
this->supportsHardwareFloatGlobalAtomicAdd = true;
if (gpuArchitecture.find("gfx900") == 0 ||
gpuArchitecture.find("gfx906") == 0 ||
gpuArchitecture.find("gfx10") == 0) {
this->supportsHardwareFloatGlobalAtomicAdd = false;
}
contextIsValid = true;
ContextSelector selector(*this);
if (contextIndex > 0) {
int canAccess;
CHECK_RESULT(hipDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice()));
if (canAccess) {
{
ContextSelector selector2(*platformData.contexts[0]);
hipError_t result = hipDeviceEnablePeerAccess(getDevice(), 0);
if (result != hipErrorPeerAccessAlreadyEnabled) {
CHECK_RESULT(result);
}
}
hipError_t result = hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0);
if (result != hipErrorPeerAccessAlreadyEnabled) {
CHECK_RESULT(result);
}
}
}
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
// For RDNA GPUs hipDeviceAttributeMultiprocessorCount means WGP (work-group processors, two compute units), not CUs.
if (simdWidth == 32)
multiprocessors *= 2;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
compilationDefines["USE_HIP"] = "1";
if (simdWidth == 32)
compilationDefines["AMD_RDNA"] = "1";
if (useDoublePrecision) {
posq.initialize<double4>(*this, paddedNumAtoms, "posq");
velm.initialize<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else if (useMixedPrecision) {
posq.initialize<float4>(*this, paddedNumAtoms, "posq");
posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
velm.initialize<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else {
posq.initialize<float4>(*this, paddedNumAtoms, "posq");
velm.initialize<float4>(*this, paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_float2";
compilationDefines["make_mixed3"] = "make_float3";
compilationDefines["make_mixed4"] = "make_float4";
}
force.initialize<long long>(*this, paddedNumAtoms*3, "force");
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
atomIndex[i] = i;
atomIndexDevice.upload(atomIndex);
// Create utility kernels that are used in multiple places.
hipModule_t utilities = createModule(HipKernelSources::vectorOps+HipKernelSources::utilities);
clearBufferKernel = getKernel(utilities, "clearBuffer");
clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
setChargesKernel = getKernel(utilities, "setCharges");
// Set defines based on the requested precision.
compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "__fsqrt_rn";
compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "__frsqrt_rn";
compilationDefines["RECIP(x)"] = useDoublePrecision ? "(1.0/(x))" : "(1.0f/(x))";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "__expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "__logf";
compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
for (int i = 0; i < system.getNumForces(); i++)
if (dynamic_cast<const MonteCarloFlexibleBarostat*>(&system.getForce(i)) != NULL)
boxIsTriclinic = true;
if (boxIsTriclinic) {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
"delta.x -= scale3*periodicBoxVecZ.x; \\\n"
"delta.y -= scale3*periodicBoxVecZ.y; \\\n"
"delta.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
"delta.x -= scale2*periodicBoxVecY.x; \\\n"
"delta.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
"delta.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
}
else {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
"pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
"pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
}
// Create utilities objects.
bonded = new HipBondedUtilities(*this);
nonbonded = new HipNonbondedUtilities(*this);
integration = new HipIntegrationUtilities(*this, system);
expression = new HipExpressionUtilities(*this);
}
HipContext::~HipContext() {
pushAsCurrent();
for (auto force : forces)
delete force;
for (auto listener : reorderListeners)
delete listener;
for (auto computation : preComputations)
delete computation;
for (auto computation : postComputations)
delete computation;
if (pinnedBuffer != NULL)
hipHostFree(pinnedBuffer);
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL)
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
for (auto module : loadedModules)
hipModuleUnload(module);
if (!isLinkedContext)
hipStreamDestroy(defaultStream);
popAsCurrent();
contextIsValid = false;
}
void HipContext::initialize() {
ContextSelector selector(*this);
string errorMessage = "Error initializing Context";
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), getHostMallocFlags()));
}
else if (useMixedPrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), getHostMallocFlags()));
}
else {
energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<float>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), getHostMallocFlags()));
}
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
}
velm.upload(pinnedBuffer);
bonded->initialize(system);
addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
int numEnergyParamDerivs = energyParamDerivNames.size();
if (numEnergyParamDerivs > 0) {
if (useDoublePrecision || useMixedPrecision)
energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
else
energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
addAutoclearBuffer(energyParamDerivBuffer);
}
findMoleculeGroups();
nonbonded->initialize(system);
}
void HipContext::initializeContexts() {
getPlatformData().initializeContexts(system);
}
void HipContext::setAsCurrent() {
if (contextIsValid)
hipSetDevice(device);
}
thread_local std::stack<hipDevice_t> outerScopeDevices;
void HipContext::pushAsCurrent() {
if (contextIsValid) {
// Emulate cuCtxPushCurrent's behavior because hipCtxPushCurrent is deprecated
hipDevice_t outerScopeDevice;
hipGetDevice(&outerScopeDevice);
outerScopeDevices.push(outerScopeDevice);
if (device != outerScopeDevice) {
hipSetDevice(device);
}
}
}
void HipContext::popAsCurrent() {
if (contextIsValid) {
// Emulate cuCtxPopCurrent's behavior because hipCtxPopCurrent is deprecated
hipDevice_t outerScopeDevice = outerScopeDevices.top();
outerScopeDevices.pop();
if (outerScopeDevice != device) {
hipSetDevice(outerScopeDevice);
}
}
}
string HipContext::getTempFileName() const {
stringstream tempFileName;
tempFileName << tempDir;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
tempFileName << "_" << std::this_thread::get_id();
return tempFileName.str();
}
string HipContext::getHash(const string& src) const {
CSHA1 sha1;
sha1.Update((const UINT_8*) src.c_str(), src.size());
sha1.Final();
UINT_8 hash[20];
sha1.GetHash(hash);
stringstream cacheHash;
cacheHash.flags(ios::hex);
for (int i = 0; i < 20; i++)
cacheHash << setw(2) << setfill('0') << (int) hash[i];
return cacheHash.str();
}
string HipContext::getCacheFileName(const string& src) const {
stringstream cacheFile;
cacheFile << cacheDir << "openmm-hip-" << getHash(src + gpuArchitecture);
return cacheFile.str();
}
hipModule_t HipContext::createModule(const string source) {
return createModule(source, map<string, string>());
}
hipModule_t HipContext::createModule(const string source, const map<string, string>& defines) {
const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS");
const bool saveTemps = saveTempsEnv != nullptr && string(saveTempsEnv) == "1";
int runtimeVersion;
CHECK_RESULT2(hipRuntimeGetVersion(&runtimeVersion), "Error getting HIP runtime version");
string options = "-O3 -ffast-math -munsafe-fp-atomics -Wall -Wno-hip-only";
options += " --offload-arch=" + gpuArchitecture;
if (gpuArchitecture.find("gfx90a") == 0 ||
gpuArchitecture.find("gfx94") == 0) {
// HIP-TODO: Remove it when the compiler does a better job
// Disable SLP vectorization as it may generate unoptimal packed math instructions on
// >=MI200 (gfx90a, gfx942): more v_mov, higher register usage etc.
options += " -fno-slp-vectorize";
}
if (getMaxThreadBlockSize() < 1024) {
options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
}
if (runtimeVersion < 60140092) {
// Workaround for operator* defined for complex types (typedefs for float2, double2) in
// ROCm 6.0 headers. This issue has been fixed in 6.1. hipRTC includes amd_hip_complex.h
// by default, we fool the include guard into thinking the header is already included.
options += " -D HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H";
}
stringstream src;
src << "// Compilation Options: " << options << endl << endl;
src << "// HIP Runtime Version: " << runtimeVersion << endl << endl;
for (auto& pair : compilationDefines) {
// Query defines to avoid duplicate variables
if (defines.find(pair.first) == defines.end()) {
src << "#define " << pair.first;
if (!pair.second.empty())
src << " " << pair.second;
src << endl;
}
}
if (!compilationDefines.empty())
src << endl;
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double3 real3;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double3 mixed3;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
src << "typedef unsigned int tileflags;\n";
src << HipKernelSources::common << endl;
for (auto& pair : defines) {
src << "#define " << pair.first;
if (!pair.second.empty())
src << " " << pair.second;
src << endl;
}
if (!defines.empty())
src << endl;
src << HipKernelSources::intrinsics << endl;
src << source << endl;
// See whether we already have PTX for this kernel cached.
string cacheFile = getCacheFileName(src.str());
hipModule_t module;
if (hipModuleLoad(&module, cacheFile.c_str()) == hipSuccess) {
loadedModules.push_back(module);
return module;
}
// Select names for the various temporary files.
if (saveTemps) {
stringstream tempFileName;
const char* saveTempsPrefixEnv = getenv("OPENMM_SAVE_TEMPS_PREFIX");
if (saveTempsPrefixEnv) {
tempFileName << saveTempsPrefixEnv;
}
tempFileName << getHash(src.str());
options += " --save-temps";
string inputFile = (tempFileName.str()+".hip");
std::cout << "Source code: " << inputFile << std::endl;
std::cout << "Compile options: " << options << std::endl;
ofstream out(inputFile.c_str());
out << src.str();
out.close();
}
// Split the command line options into an array of options.
stringstream flagsStream(options);
string flag;
vector<string> splitFlags;
while (flagsStream >> flag)
splitFlags.push_back(flag);
int numOptions = splitFlags.size();
vector<const char*> optionsVec(numOptions);
for (int i = 0; i < numOptions; i++)
optionsVec[i] = &splitFlags[i][0];
// Compile the program to CO.
hiprtcProgram program;
HIPRTC_CHECK_RESULT(hiprtcCreateProgram(&program, src.str().c_str(), NULL, 0, NULL, NULL), "Error creating program");
try {
hiprtcResult result = hiprtcCompileProgram(program, optionsVec.size(), &optionsVec[0]);
if (result != HIPRTC_SUCCESS || saveTemps) {
size_t logSize;
hiprtcGetProgramLogSize(program, &logSize);
std::string log(logSize, '\0');
if (logSize > 0) {
hiprtcGetProgramLog(program, &log[0]);
if (saveTemps) {
std::cout << "Log: " << log << std::endl;
}
}
if (result != HIPRTC_SUCCESS) {
throw OpenMMException("Error compiling program: "+log);
}
}
size_t codeSize;
hiprtcGetCodeSize(program, &codeSize);
vector<char> code(codeSize);
hiprtcGetCode(program, &code[0]);
hiprtcDestroyProgram(&program);
// If possible, write the CO out to a cache file for later use.
try {
ofstream out(cacheFile.c_str(), ios::out | ios::binary);
out.write(&code[0], code.size());
out.close();
}
catch (...) {
// An error occurred. Possibly we don't have permission to write to the temp directory.
// Ignore.
}
CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module");
loadedModules.push_back(module);
return module;
}
catch (...) {
hiprtcDestroyProgram(&program);
throw;
}
}
hipFunction_t HipContext::getKernel(hipModule_t& module, const string& name) {
hipFunction_t function;
hipError_t result = hipModuleGetFunction(&function, module, name.c_str());
if (result != hipSuccess) {
std::stringstream m;
m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str());
}
return function;
}
vector<ComputeContext*> HipContext::getAllContexts() {
vector<ComputeContext*> result;
for (HipContext* c : platformData.contexts)
result.push_back(c);
return result;
}
hipStream_t HipContext::getCurrentStream() {
return currentStream;
}
void HipContext::setCurrentStream(hipStream_t stream) {
currentStream = stream;
}
void HipContext::restoreDefaultStream() {
currentStream = defaultStream;
}
HipArray* HipContext::createArray() {
return new HipArray();
}
ComputeEvent HipContext::createEvent() {
return shared_ptr<ComputeEventImpl>(new HipEvent(*this));
}
HipFFT3D* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) {
return new HipFFT3D(*this, xsize, ysize, zsize, realToComplex, stream, in, out);
}
int HipContext::findLegalFFTDimension(int minimum) {
return HipFFT3D::findLegalDimension(minimum);
}
ComputeProgram HipContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
hipModule_t module = createModule(HipKernelSources::vectorOps+source, defines);
return shared_ptr<ComputeProgramImpl>(new HipProgram(*this, module));
}
HipArray& HipContext::unwrap(ArrayInterface& array) const {
HipArray* cuarray;
ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array);
if (wrapper != NULL)
cuarray = dynamic_cast<HipArray*>(&wrapper->getArray());
else
cuarray = dynamic_cast<HipArray*>(&array);
if (cuarray == NULL)
throw OpenMMException("Array argument is not an HipArray");
return *cuarray;
}
std::string HipContext::getErrorString(hipError_t result) {
return string(hipGetErrorName(result));
}
void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != hipSuccess) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
void HipContext::executeKernelFlat(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = (threads+blockSize-1)/blockSize;
hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != hipSuccess) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
int HipContext::computeThreadBlockSize(double memory) const {
int maxShared = this->sharedMemPerBlock;
int max = (int) (maxShared/memory);
if (max < HipContext::ThreadBlockSize) {
throw OpenMMException("Too much shared memory requested!");
}
int threads = this->simdWidth;
while (threads+this->simdWidth < max)
threads += this->simdWidth;
return threads;
}
void HipContext::clearBuffer(ArrayInterface& array) {
clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
}
void HipContext::clearBuffer(hipDeviceptr_t memory, int size) {
int words = size/4;
void* args[] = {&memory, &words};
executeKernel(clearBufferKernel, args, words, 4 * this->simdWidth);
}
void HipContext::addAutoclearBuffer(ArrayInterface& array) {
addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
}
void HipContext::addAutoclearBuffer(hipDeviceptr_t memory, int size) {
autoclearBuffers.push_back(memory);
autoclearBufferSizes.push_back(size/4);
}
void HipContext::clearAutoclearBuffers() {
int preferredTBSize = this->simdWidth * 4;
int base = 0;
int total = autoclearBufferSizes.size();
while (total-base >= 6) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
&autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
&autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), preferredTBSize);
base += 6;
}
if (total-base == 5) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
&autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), preferredTBSize);
}
else if (total-base == 4) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), preferredTBSize);
}
else if (total-base == 3) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), preferredTBSize);
}
else if (total-base == 2) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), preferredTBSize);
}
else if (total-base == 1) {
clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
}
}
double HipContext::reduceEnergy() {
int bufferSize = energyBuffer.getSize();
int workGroupSize = getMaxThreadBlockSize();
void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum.download(pinnedBuffer);
double result = 0;
if (getUseDoublePrecision() || getUseMixedPrecision()) {
for (int i = 0; i < energySum.getSize(); i++)
result += ((double*) pinnedBuffer)[i];
}
else {
for (int i = 0; i < energySum.getSize(); i++)
result += ((float*) pinnedBuffer)[i];
}
return result;
}
void HipContext::setCharges(const vector<double>& charges) {
if (!chargeBuffer.isInitialized())
chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
vector<double> c(numAtoms);
for (int i = 0; i < numAtoms; i++)
c[i] = charges[i];
chargeBuffer.upload(c, true);
void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
executeKernel(setChargesKernel, args, numAtoms);
}
bool HipContext::requestPosqCharges() {
bool allow = !hasAssignedPosqCharges;
hasAssignedPosqCharges = true;
return allow;
}
void HipContext::addEnergyParameterDerivative(const string& param) {
// See if this parameter has already been registered.
for (int i = 0; i < energyParamDerivNames.size(); i++)
if (param == energyParamDerivNames[i])
return;
energyParamDerivNames.push_back(param);
}
void HipContext::flushQueue() {
hipStreamSynchronize(getCurrentStream());
}
vector<int> HipContext::getDevicePrecedence() {
int numDevices;
hipDeviceProp_t thisDevice;
string errorMessage = "Error initializing Context";
vector<pair<int, int> > devices;
CHECK_RESULT(hipGetDeviceCount(&numDevices));
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i));
int clock, multiprocessors, speed;
clock = thisDevice.clockRate;
multiprocessors = thisDevice.multiProcessorCount;
speed = clock*multiprocessors;
devices.push_back(std::make_pair(speed, -i));
}
// sort first by speed (higher is better), and finally device index (lower is better)
std::sort(devices.begin(), devices.end());
std::reverse(devices.begin(), devices.end());
vector<int> precedence;
for (int i = 0; i < static_cast<int>(devices.size()); i++) {
precedence.push_back(-devices[i].second);
}
return precedence;
}
unsigned int HipContext::getEventFlags() {
unsigned int flags = hipEventDisableTiming;
return flags;
}
unsigned int HipContext::getHostMallocFlags() {
#ifdef WIN32
return hipHostMallocDefault;
#else
return hipHostMallocNumaUser;
#endif
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipEvent.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
HipEvent::HipEvent(HipContext& context) : context(context), eventCreated(false) {
hipError_t result = hipEventCreateWithFlags(&event, context.getEventFlags());
if (result != hipSuccess)
throw OpenMMException("Error creating HIP event:"+HipContext::getErrorString(result));
eventCreated = true;
}
HipEvent::~HipEvent() {
if (eventCreated)
hipEventDestroy(event);
}
void HipEvent::enqueue() {
hipEventRecord(event, context.getCurrentStream());
}
void HipEvent::wait() {
hipEventSynchronize(event);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Portions copyright (c) 2021 Advanced Micro Devices, Inc. *
* Authors: *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipFFT3D.h"
#include "HipContext.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include <iterator>
using namespace OpenMM;
using namespace std;
HipFFT3D::HipFFT3D(HipContext& context, int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) :
context(context), stream(stream) {
deviceIndex = context.getDeviceIndex();
inputBuffer = in.getDevicePointer();
outputBuffer = out.getDevicePointer();
size_t valueSize = context.getUseDoublePrecision() ? sizeof(double) : sizeof(float);
inputBufferSize = zsize * ysize * xsize * valueSize;
if (realToComplex) {
outputBufferSize = (zsize/2 + 1) * ysize * xsize * valueSize * 2;
}
else {
outputBufferSize = zsize * ysize * xsize * valueSize;
}
VkFFTConfiguration configuration = {};
configuration.performR2C = realToComplex;
configuration.device = &deviceIndex;
configuration.num_streams = 1;
configuration.stream = &this->stream;
configuration.doublePrecision = context.getUseDoublePrecision();
configuration.FFTdim = 3;
configuration.size[0] = zsize;
configuration.size[1] = ysize;
configuration.size[2] = xsize;
configuration.inverseReturnToInputBuffer = true;
configuration.isInputFormatted = true;
configuration.inputBufferSize = &inputBufferSize;
configuration.inputBuffer = &inputBuffer;
configuration.inputBufferStride[0] = zsize;
configuration.inputBufferStride[1] = configuration.inputBufferStride[0] * ysize;
configuration.inputBufferStride[2] = configuration.inputBufferStride[1] * xsize;
configuration.bufferSize = &outputBufferSize;
configuration.buffer = &outputBuffer;
configuration.bufferStride[0] = realToComplex ? (zsize/2 + 1) : zsize;
configuration.bufferStride[1] = configuration.bufferStride[0] * ysize;
configuration.bufferStride[2] = configuration.bufferStride[1] * xsize;
// Combine all parameters into a unique key
stringstream info;
int runtimeVersion;
(void)hipRuntimeGetVersion(&runtimeVersion);
info << runtimeVersion;
info << " " << VkFFTGetVersion();
info << " " << xsize << " " << ysize << " " << zsize;
info << " " << realToComplex << " " << context.getUseDoublePrecision();
string cacheFile = context.getCacheFileName(info.str());
bool hasCache = false;
vector<char> cacheContent;
ifstream cache(cacheFile.c_str(), ios::in | ios::binary);
if (cache.is_open()) {
cacheContent.insert(cacheContent.begin(), istreambuf_iterator<char>(cache), istreambuf_iterator<char>());
cache.close();
hasCache = true;
// There is an existing cache, load VkFFT kernels from it
configuration.loadApplicationFromString = 1;
configuration.loadApplicationString = cacheContent.data();
}
else {
// There is no existing cache, request saving
configuration.saveApplicationToString = 1;
}
app = new VkFFTApplication();
VkFFTResult fftResult = initializeVkFFT(app, configuration);
if (fftResult != VKFFT_SUCCESS) {
throw OpenMMException("Error executing initializeVkFFT: "+context.intToString(fftResult));
}
if (!hasCache) {
// There is no existing cache, create it
string outputFile = context.getTempFileName() + ".vkfftcache";
try {
ofstream out(outputFile.c_str(), ios::out | ios::binary);
out.write(reinterpret_cast<char*>(app->saveApplicationString), size_t(app->applicationStringSize));
out.close();
if (!out.fail()) {
if (rename(outputFile.c_str(), cacheFile.c_str()) != 0)
remove(outputFile.c_str());
}
}
catch (...) {
// An error occurred. Possibly we don't have permission to write to the temp directory.
}
}
}
HipFFT3D::~HipFFT3D() {
deleteVkFFT(app);
delete app;
}
void HipFFT3D::execFFT(bool forward) {
VkFFTResult fftResult = VkFFTAppend(app, forward ? -1 : 1, NULL);
if (fftResult != VKFFT_SUCCESS) {
throw OpenMMException("Error executing VkFFTAppend: "+context.intToString(fftResult));
}
}
int HipFFT3D::findLegalDimension(int minimum) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
int unfactored = minimum;
// VkFFT supports prime factors up to 13
for (int factor = 2; factor <= 13; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
return minimum;
minimum++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipIntegrationUtilities.h"
#include "HipContext.h"
#include "openmm/common/ContextSelector.h"
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<dynamic_cast<HipContext&>(context).getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
HipIntegrationUtilities::HipIntegrationUtilities(HipContext& context, const System& system) : IntegrationUtilities(context, system),
ccmaConvergedMemory(NULL) {
CHECK_RESULT2(hipEventCreateWithFlags(&ccmaEvent, context.getEventFlags()), "Error creating event for CCMA");
CHECK_RESULT2(hipHostMalloc((void**) &ccmaConvergedMemory, sizeof(int), context.getHostMallocFlags()), "Error allocating pinned memory");
CHECK_RESULT2(hipHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
}
HipIntegrationUtilities::~HipIntegrationUtilities() {
ContextSelector selector(context);
if (ccmaConvergedMemory != NULL) {
hipHostFree(ccmaConvergedMemory);
hipEventDestroy(ccmaEvent);
}
}
HipArray& HipIntegrationUtilities::getPosDelta() {
return dynamic_cast<HipContext&>(context).unwrap(posDelta);
}
HipArray& HipIntegrationUtilities::getRandom() {
return dynamic_cast<HipContext&>(context).unwrap(random);
}
HipArray& HipIntegrationUtilities::getStepSize() {
return dynamic_cast<HipContext&>(context).unwrap(stepSize);
}
void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
ContextSelector selector(context);
ComputeKernel settleKernel, shakeKernel, ccmaForceKernel;
if (constrainVelocities) {
settleKernel = settleVelKernel;
shakeKernel = shakeVelKernel;
ccmaForceKernel = ccmaVelForceKernel;
}
else {
settleKernel = settlePosKernel;
shakeKernel = shakePosKernel;
ccmaForceKernel = ccmaPosForceKernel;
}
if (settleAtoms.isInitialized()) {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel->setArg(1, tol);
else
settleKernel->setArg(1, (float) tol);
settleKernel->execute(settleAtoms.getSize());
}
if (shakeAtoms.isInitialized()) {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel->setArg(1, tol);
else
shakeKernel->setArg(1, (float) tol);
shakeKernel->execute(shakeAtoms.getSize());
}
if (ccmaConstraintAtoms.isInitialized()) {
if (ccmaConstraintAtoms.getSize() <= 1024) {
// Use the version of CCMA that runs in a single kernel with one workgroup.
ccmaFullKernel->setArg(0, (int) constrainVelocities);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaFullKernel->setArg(14, tol);
else
ccmaFullKernel->setArg(14, (float) tol);
ccmaFullKernel->execute(128, 128);
}
else {
ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel->setArg(7, tol);
else
ccmaForceKernel->setArg(7, (float) tol);
ccmaDirectionsKernel->execute(ccmaConstraintAtoms.getSize());
const int checkInterval = 4;
ccmaConvergedMemory[0] = 0;
ccmaUpdateKernel->setArg(4, constrainVelocities ? context.getVelm() : posDelta);
for (int i = 0; i < 150; i++) {
ccmaForceKernel->setArg(8, i);
ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
if ((i+1)%checkInterval == 0)
CHECK_RESULT2(hipEventRecord(ccmaEvent, dynamic_cast<HipContext&>(context).getCurrentStream()), "Error recording event for CCMA");
ccmaMultiplyKernel->setArg(5, i);
ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
ccmaUpdateKernel->setArg(9, i);
ccmaUpdateKernel->execute(context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(hipEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (ccmaConvergedMemory[0])
break;
}
}
}
}
}
void HipIntegrationUtilities::distributeForcesFromVirtualSites() {
ContextSelector selector(context);
for (int i = numVsiteStages-1; i >= 0; i--) {
vsiteForceKernel->setArg(2, context.getLongForceBuffer());
vsiteForceKernel->setArg(15, i);
vsiteForceKernel->execute(numVsites);
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernel.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/internal/AssertionUtilities.h"
#include <cstring>
#include <vector>
using namespace OpenMM;
using namespace std;
HipKernel::HipKernel(HipContext& context, hipFunction_t kernel, const string& name) : context(context), kernel(kernel), name(name) {
}
string HipKernel::getName() const {
return name;
}
int HipKernel::getMaxBlockSize() const {
int size;
hipError_t result = hipFuncGetAttribute(&size, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
if (result != hipSuccess)
throw OpenMMException("Error querying max thread block size: "+context.getErrorString(result));
return size;
}
void HipKernel::execute(int threads, int blockSize) {
int numArgs = arrayArgs.size();
argPointers.resize(numArgs);
for (int i = 0; i < numArgs; i++) {
if (arrayArgs[i] != NULL)
argPointers[i] = &arrayArgs[i]->getDevicePointer();
else
argPointers[i] = &primitiveArgs[i];
}
context.executeKernel(kernel, argPointers.data(), threads, blockSize);
}
void HipKernel::addArrayArg(ArrayInterface& value) {
int index = arrayArgs.size();
addEmptyArg();
setArrayArg(index, value);
}
void HipKernel::addPrimitiveArg(const void* value, int size) {
int index = arrayArgs.size();
addEmptyArg();
setPrimitiveArg(index, value, size);
}
void HipKernel::addEmptyArg() {
primitiveArgs.push_back(make_double4(0, 0, 0, 0));
arrayArgs.push_back(NULL);
}
void HipKernel::setArrayArg(int index, ArrayInterface& value) {
ASSERT_VALID_INDEX(index, arrayArgs);
arrayArgs[index] = &context.unwrap(value);
}
void HipKernel::setPrimitiveArg(int index, const void* value, int size) {
ASSERT_VALID_INDEX(index, primitiveArgs);
if (size > sizeof(double4))
throw OpenMMException("Unsupported value type for kernel argument");
memcpy(&primitiveArgs[index], value, size);
arrayArgs[index] = NULL;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2024 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernelFactory.h"
#include "HipKernels.h"
#include "HipParallelKernels.h"
#include "HipPlatform.h"
#include "openmm/common/CommonKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
HipPlatform::PlatformData& data = *static_cast<HipPlatform::PlatformData*>(context.getPlatformData());
if (data.contexts.size() > 1) {
// We are running in parallel on multiple devices, so we may want to create a parallel kernel.
if (name == CalcForcesAndEnergyKernel::Name())
return new HipParallelCalcForcesAndEnergyKernel(name, platform, data);
if (name == CalcHarmonicBondForceKernel::Name())
return new HipParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
return new HipParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
if (name == CalcHarmonicAngleForceKernel::Name())
return new HipParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomAngleForceKernel::Name())
return new HipParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
if (name == CalcPeriodicTorsionForceKernel::Name())
return new HipParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcRBTorsionForceKernel::Name())
return new HipParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcCMAPTorsionForceKernel::Name())
return new HipParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name())
return new HipParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcNonbondedForceKernel::Name())
return new HipParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomNonbondedForceKernel::Name())
return new HipParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
return new HipParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomHbondForceKernel::Name())
return new HipParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name())
return new HipParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
}
HipContext& cu = *data.contexts[0];
if (name == CalcForcesAndEnergyKernel::Name())
return new HipCalcForcesAndEnergyKernel(name, platform, cu);
if (name == UpdateStateDataKernel::Name())
return new CommonUpdateStateDataKernel(name, platform, cu);
if (name == ApplyConstraintsKernel::Name())
return new CommonApplyConstraintsKernel(name, platform, cu);
if (name == VirtualSitesKernel::Name())
return new CommonVirtualSitesKernel(name, platform, cu);
if (name == CalcHarmonicBondForceKernel::Name())
return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
return new CommonCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcHarmonicAngleForceKernel::Name())
return new CommonCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomAngleForceKernel::Name())
return new CommonCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcPeriodicTorsionForceKernel::Name())
return new CommonCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcRBTorsionForceKernel::Name())
return new CommonCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCMAPTorsionForceKernel::Name())
return new CommonCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name())
return new CommonCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcNonbondedForceKernel::Name())
return new HipCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomNonbondedForceKernel::Name())
return new CommonCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGBSAOBCForceKernel::Name())
return new CommonCalcGBSAOBCForceKernel(name, platform, cu);
if (name == CalcCustomGBForceKernel::Name())
return new CommonCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
return new CommonCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomHbondForceKernel::Name())
return new CommonCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCentroidBondForceKernel::Name())
return new CommonCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name())
return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCVForceKernel::Name())
return new HipCalcCustomCVForceKernel(name, platform, cu);
if (name == CalcCustomCPPForceKernel::Name())
return new CommonCalcCustomCPPForceKernel(name, platform, context, cu);
if (name == CalcRMSDForceKernel::Name())
return new CommonCalcRMSDForceKernel(name, platform, cu);
if (name == CalcCustomManyParticleForceKernel::Name())
return new CommonCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGayBerneForceKernel::Name())
return new CommonCalcGayBerneForceKernel(name, platform, cu);
if (name == IntegrateVerletStepKernel::Name())
return new CommonIntegrateVerletStepKernel(name, platform, cu);
if (name == IntegrateLangevinMiddleStepKernel::Name())
return new CommonIntegrateLangevinMiddleStepKernel(name, platform, cu);
if (name == IntegrateBrownianStepKernel::Name())
return new CommonIntegrateBrownianStepKernel(name, platform, cu);
if (name == IntegrateVariableVerletStepKernel::Name())
return new CommonIntegrateVariableVerletStepKernel(name, platform, cu);
if (name == IntegrateVariableLangevinStepKernel::Name())
return new CommonIntegrateVariableLangevinStepKernel(name, platform, cu);
if (name == IntegrateCustomStepKernel::Name())
return new CommonIntegrateCustomStepKernel(name, platform, cu);
if (name == ApplyAndersenThermostatKernel::Name())
return new CommonApplyAndersenThermostatKernel(name, platform, cu);
if (name == IntegrateNoseHooverStepKernel::Name())
return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
if (name == ApplyMonteCarloBarostatKernel::Name())
return new CommonApplyMonteCarloBarostatKernel(name, platform, cu);
if (name == RemoveCMMotionKernel::Name())
return new CommonRemoveCMMotionKernel(name, platform, cu);
if (name == CalcATMForceKernel::Name() )
return new HipCalcATMForceKernel(name, platform, cu);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernelSources.h"
using namespace OpenMM;
using namespace std;
#ifndef OPENMM_HIPKERNELSOURCES_H_
#define OPENMM_HIPKERNELSOURCES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/common/windowsExportCommon.h"
#include <string>
namespace OpenMM {
/**
* This class is a central holding place for the source code of HIP kernels.
* The CMake build script inserts declarations into it based on the .hip files in the
* kernels subfolder.
*/
class OPENMM_EXPORT_COMMON HipKernelSources {
public:
@KERNEL_FILE_DECLARATIONS@
};
} // namespace OpenMM
#endif /*OPENMM_HIPKERNELSOURCES_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2022 Stanford University and the Authors. *
* Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernels.h"
#include "HipForceInfo.h"
#include "openmm/Context.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "openmm/common/ContextSelector.h"
#include "CommonKernelSources.h"
#include "HipBondedUtilities.h"
#include "HipExpressionUtilities.h"
#include "HipIntegrationUtilities.h"
#include "HipNonbondedUtilities.h"
#include "HipKernelSources.h"
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMUtilities.h"
#include <algorithm>
#include <cmath>
#include <iterator>
#include <set>
#include <assert.h>
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
void HipCalcForcesAndEnergyKernel::initialize(const System& system) {
}
void HipCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cu.setForcesValid(true);
ContextSelector selector(cu);
cu.clearAutoclearBuffers();
for (auto computation : cu.getPreComputations())
computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
HipNonbondedUtilities& nb = cu.getNonbondedUtilities();
cu.setComputeForceCount(cu.getComputeForceCount()+1);
nb.prepareInteractions(groups);
map<string, double>& derivs = cu.getEnergyParamDerivWorkspace();
for (auto& param : context.getParameters())
derivs[param.first] = 0;
}
double HipCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
ContextSelector selector(cu);
cu.getBondedUtilities().computeInteractions(groups);
cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
double sum = 0.0;
for (auto computation : cu.getPostComputations())
sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy)
sum += cu.reduceEnergy();
if (!cu.getForcesValid())
valid = false;
return sum;
}
class HipCalcNonbondedForceKernel::ForceInfo : public HipForceInfo {
public:
ForceInfo(const NonbondedForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
int getNumParticleGroups() {
return force.getNumExceptions();
}
void getParticlesInGroup(int index, vector<int>& particles) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
particles.resize(2);
particles[0] = particle1;
particles[1] = particle2;
}
bool areGroupsIdentical(int group1, int group2) {
int particle1, particle2;
double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
private:
const NonbondedForce& force;
};
class HipCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(HipContext& cu, hipFunction_t addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel) {
forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
}
float* getPosq() {
ContextSelector selector(cu);
cu.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp.upload(force);
void* args[] = {&forceTemp.getDevicePointer(), &cu.getForce().getDevicePointer()};
cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
}
private:
HipContext& cu;
vector<float4> posq;
HipArray forceTemp;
hipFunction_t addForcesKernel;
};
class HipCalcNonbondedForceKernel::PmePreComputation : public HipContext::ForcePreComputation {
public:
PmePreComputation(HipContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxVectors[3] = {Vec3(cu.getPeriodicBoxSize().x, 0, 0), Vec3(0, cu.getPeriodicBoxSize().y, 0), Vec3(0, 0, cu.getPeriodicBoxSize().z)};
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
}
private:
HipContext& cu;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class HipCalcNonbondedForceKernel::PmePostComputation : public HipContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class HipCalcNonbondedForceKernel::SyncStreamPreComputation : public HipContext::ForcePreComputation {
public:
SyncStreamPreComputation(HipContext& cu, hipStream_t stream, hipEvent_t event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
hipEventRecord(event, cu.getCurrentStream());
hipStreamWaitEvent(stream, event, 0);
}
}
private:
HipContext& cu;
hipStream_t stream;
hipEvent_t event;
int forceGroup;
};
class HipCalcNonbondedForceKernel::SyncStreamPostComputation : public HipContext::ForcePostComputation {
public:
SyncStreamPostComputation(HipContext& cu, hipEvent_t event, hipFunction_t addEnergyKernel, HipArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (includeEnergy) {
int bufferSize = pmeEnergyBuffer.getSize();
void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
cu.executeKernel(addEnergyKernel, args, bufferSize);
}
}
return 0.0;
}
private:
HipContext& cu;
hipEvent_t event;
hipFunction_t addEnergyKernel;
HipArray& pmeEnergyBuffer;
int forceGroup;
};
HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
ContextSelector selector(cu);
if (sort != NULL)
delete sort;
if (fft != NULL)
delete fft;
if (dispersionFft != NULL)
delete dispersionFft;
if (pmeio != NULL)
delete pmeio;
if (hasInitializedFFT) {
if (usePmeStream) {
hipStreamDestroy(pmeStream);
hipEventDestroy(pmeSyncEvent);
hipEventDestroy(paramsSyncEvent);
}
}
}
void HipCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
ContextSelector selector(cu);
int forceIndex;
for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
;
string prefix = "nonbonded"+cu.intToString(forceIndex)+"_";
// Identify which exceptions are 1-4 interactions.
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<pair<int, int> > exclusions;
vector<int> exceptions;
map<int, int> exceptionIndex;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
exclusions.push_back(pair<int, int>(particle1, particle2));
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
exceptionIndex[i] = exceptions.size();
exceptions.push_back(i);
}
}
// Initialize nonbonded interactions.
int numParticles = force.getNumParticles();
vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
vector<vector<int> > exclusionList(numParticles);
hasCoulomb = false;
hasLJ = false;
for (int i = 0; i < numParticles; i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
exclusionList[i].push_back(i);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (auto exclusion : exclusions) {
exclusionList[exclusion.first].push_back(exclusion.second);
exclusionList[exclusion.second].push_back(exclusion.first);
}
nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
bool useCutoff = (nonbondedMethod != NoCutoff);
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cu.requestPosqCharges() : false;
map<string, string> defines;
defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
if (useCutoff) {
// Compute the reaction field constants.
double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
// Compute the switching coefficients.
if (force.getUseSwitchingFunction()) {
defines["LJ_SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
defines["LJ_SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
defines["LJ_SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
}
}
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
dispersionCoefficient = 0.0;
alpha = 0;
ewaldSelfEnergy = 0.0;
map<string, string> paramsDefines;
paramsDefines["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
if (hasOffsets)
paramsDefines["HAS_OFFSETS"] = "1";
if (force.getNumParticleParameterOffsets() > 0)
paramsDefines["HAS_PARTICLE_OFFSETS"] = "1";
if (force.getNumExceptionParameterOffsets() > 0)
paramsDefines["HAS_EXCEPTION_OFFSETS"] = "1";
if (usePosqCharges)
paramsDefines["USE_POSQ_CHARGES"] = "1";
if (doLJPME)
paramsDefines["INCLUDE_LJPME_EXCEPTIONS"] = "1";
if (nonbondedMethod == Ewald) {
// Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz;
NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
if (cu.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
// Create the reciprocal space kernels.
map<string, string> replacements;
replacements["NUM_ATOMS"] = cu.intToString(numParticles);
replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
replacements["KMAX_X"] = cu.intToString(kmaxx);
replacements["KMAX_Y"] = cu.intToString(kmaxy);
replacements["KMAX_Z"] = cu.intToString(kmaxz);
replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
replacements["M_PI"] = cu.doubleToString(M_PI);
hipModule_t module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::ewald, replacements);
ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
}
}
else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
// Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = cu.findLegalFFTDimension(gridSizeX);
gridSizeY = cu.findLegalFFTDimension(gridSizeY);
gridSizeZ = cu.findLegalFFTDimension(gridSizeZ);
if (doLJPME) {
NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = cu.findLegalFFTDimension(dispersionGridSizeX);
dispersionGridSizeY = cu.findLegalFFTDimension(dispersionGridSizeY);
dispersionGridSizeZ = cu.findLegalFFTDimension(dispersionGridSizeZ);
}
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
defines["DO_LJPME"] = doLJPME ? "1" : "0";
if (doLJPME) {
defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
double dar4 = dar2*dar2;
double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
defines["INVCUT6"] = cu.doubleToString(invRCut6);
defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
}
if (cu.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
if (doLJPME) {
paramsDefines["INCLUDE_LJPME"] = "1";
paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cu.doubleToString(pow(dispersionAlpha, 6)/3.0);
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
usePmeStream = (!cu.getPlatformData().disablePmeStream && !cu.getPlatformData().useCpuPme);
map<string, string> pmeDefines;
pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1";
map<string, string> replacements;
replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
hipModule_t module = cu.createModule(HipKernelSources::vectorOps+cu.replaceStrings(CommonKernelSources::pme, replacements), pmeDefines);
if (cu.getPlatformData().useCpuPme && !doLJPME && usePosqCharges) {
// Create the CPU PME kernel.
try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
hipFunction_t addForcesKernel = cu.getKernel(module, "addForces");
pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
}
}
if (pmeio == NULL) {
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
hipFuncSetCacheConfig(pmeSpreadChargeKernel, hipFuncCachePreferShared);
hipFuncSetCacheConfig(pmeInterpolateForceKernel, hipFuncCachePreferL1);
if (doLJPME) {
pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
hipFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, hipFuncCachePreferL1);
}
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
if (doLJPME) {
pmeDispersionBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
}
pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*HipContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(pmeEnergyBuffer);
sort = new HipSort(cu, new SortTrait(), cu.getNumAtoms());
// Prepare for doing PME on its own stream.
if (usePmeStream) {
CHECK_RESULT(hipStreamCreateWithFlags(&pmeStream, hipStreamNonBlocking), "Error creating stream for NonbondedForce");
CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
}
hipStream_t fftStream = usePmeStream ? pmeStream : cu.getCurrentStream();
fft = cu.createFFT(gridSizeX, gridSizeY, gridSizeZ, true, fftStream, pmeGrid1, pmeGrid2);
if (doLJPME)
dispersionFft = cu.createFFT(dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true, fftStream, pmeGrid1, pmeGrid2);
hasInitializedFFT = true;
// Initialize the b-spline moduli.
for (int grid = 0; grid < 2; grid++) {
int xsize, ysize, zsize;
HipArray *xmoduli, *ymoduli, *zmoduli;
if (grid == 0) {
xsize = gridSizeX;
ysize = gridSizeY;
zsize = gridSizeZ;
xmoduli = &pmeBsplineModuliX;
ymoduli = &pmeBsplineModuliY;
zmoduli = &pmeBsplineModuliZ;
}
else {
if (!doLJPME)
continue;
xsize = dispersionGridSizeX;
ysize = dispersionGridSizeY;
zsize = dispersionGridSizeZ;
xmoduli = &pmeDispersionBsplineModuliX;
ymoduli = &pmeDispersionBsplineModuliY;
zmoduli = &pmeDispersionBsplineModuliZ;
}
int maxSize = max(max(xsize, ysize), zsize);
vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder);
vector<double> bsplines_data(maxSize);
data[PmeOrder-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
// Differentiate.
ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++)
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PmeOrder-1);
data[PmeOrder-1] = 0.0;
for (int i = 1; i < (PmeOrder-1); i++)
data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
data[0] = div*data[0];
for (int i = 0; i < maxSize; i++)
bsplines_data[i] = 0.0;
for (int i = 1; i <= PmeOrder; i++)
bsplines_data[i] = data[i-1];
// Evaluate the actual bspline moduli for X/Y/Z.
for (int dim = 0; dim < 3; dim++) {
int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
vector<double> moduli(ndata);
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
}
moduli[i] = sc*sc+ss*ss;
}
for (int i = 0; i < ndata; i++)
if (moduli[i] < 1.0e-7)
moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5;
if (dim == 0)
xmoduli->upload(moduli, true);
else if (dim == 1)
ymoduli->upload(moduli, true);
else
zmoduli->upload(moduli, true);
}
}
}
}
}
// Add code to subtract off the reciprocal part of excluded interactions.
if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
int numExclusions = endIndex-startIndex;
if (numExclusions > 0) {
paramsDefines["HAS_EXCLUSIONS"] = "1";
vector<vector<int> > atoms(numExclusions, vector<int>(2));
exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
vector<int2> exclusionAtomsVec(numExclusions);
for (int i = 0; i < numExclusions; i++) {
int j = i+startIndex;
exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
atoms[i][0] = exclusions[j].first;
atoms[i][1] = exclusions[j].second;
}
exclusionAtoms.upload(exclusionAtomsVec);
map<string, string> replacements;
replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams, "float4");
replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
replacements["DO_LJPME"] = doLJPME ? "1" : "0";
replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
if (doLJPME)
replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
if (force.getIncludeDirectSpace())
cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
}
}
// Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
baseParticleParams.upload(baseParticleParamVec);
map<string, string> replacements;
replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
if (usePosqCharges) {
replacements["CHARGE1"] = "posq1.w";
replacements["CHARGE2"] = "posq2.w";
}
else {
replacements["CHARGE1"] = prefix+"charge1";
replacements["CHARGE2"] = prefix+"charge2";
}
if (hasCoulomb && !usePosqCharges)
cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
if (hasLJ) {
replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
}
source = cu.replaceStrings(source, replacements);
if (force.getIncludeDirectSpace())
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);
// Initialize the exceptions.
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions > 0) {
paramsDefines["HAS_EXCEPTIONS"] = "1";
exceptionAtoms.resize(numExceptions);
vector<vector<int> > atoms(numExceptions, vector<int>(2));
exceptionParams.initialize<float4>(cu, numExceptions, "exceptionParams");
baseExceptionParams.initialize<float4>(cu, numExceptions, "baseExceptionParams");
vector<float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
}
baseExceptionParams.upload(baseExceptionParamsVec);
map<string, string> replacements;
replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams, "float4");
if (force.getIncludeDirectSpace())
cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
}
// Initialize parameter offsets.
vector<vector<float4> > particleOffsetVec(force.getNumParticles());
vector<vector<float4> > exceptionOffsetVec(numExceptions);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
particleOffsetVec[particle].push_back(make_float4(charge, sigma, epsilon, paramIndex));
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
int index = exceptionIndex[exception];
if (index < startIndex || index >= endIndex)
continue;
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
exceptionOffsetVec[index-startIndex].push_back(make_float4(charge, sigma, epsilon, paramIndex));
}
paramValues.resize(paramNames.size(), 0.0);
particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
vector<float4> p, e;
for (int i = 0; i < particleOffsetVec.size(); i++) {
particleOffsetIndicesVec.push_back(p.size());
for (int j = 0; j < particleOffsetVec[i].size(); j++)
p.push_back(particleOffsetVec[i][j]);
}
while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
particleOffsetIndicesVec.push_back(p.size());
for (int i = 0; i < exceptionOffsetVec.size(); i++) {
exceptionOffsetIndicesVec.push_back(e.size());
for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
e.push_back(exceptionOffsetVec[i][j]);
}
exceptionOffsetIndicesVec.push_back(e.size());
if (force.getNumParticleParameterOffsets() > 0) {
particleParamOffsets.upload(p);
particleOffsetIndices.upload(particleOffsetIndicesVec);
}
exceptionParamOffsets.initialize<float4>(cu, max((int) e.size(), 1), "exceptionParamOffsets");
exceptionOffsetIndices.initialize<int>(cu, exceptionOffsetIndicesVec.size(), "exceptionOffsetIndices");
if (e.size() > 0) {
exceptionParamOffsets.upload(e);
exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
}
globalParams.initialize(cu, max((int) paramValues.size(), 1), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
if (paramValues.size() > 0)
globalParams.upload(paramValues, true);
recomputeParams = true;
// Initialize the kernel for updating parameters.
hipModule_t module = cu.createModule(CommonKernelSources::nonbondedParameters, paramsDefines);
computeParamsKernel = cu.getKernel(module, "computeParameters");
computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
info = new ForceInfo(force);
cu.addForce(info);
}
double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
// Update particle and exception parameters.
ContextSelector selector(cu);
bool paramChanged = false;
for (int i = 0; i < paramNames.size(); i++) {
double value = context.getParameter(paramNames[i]);
if (value != paramValues[i]) {
paramValues[i] = value;;
paramChanged = true;
}
}
if (paramChanged) {
recomputeParams = true;
globalParams.upload(paramValues, true);
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (recomputeParams || hasOffsets) {
int computeSelfEnergy = (includeEnergy && includeReciprocal);
int numAtoms = cu.getPaddedNumAtoms();
vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
&baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
&particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
int numExceptions;
if (exceptionParams.isInitialized()) {
numExceptions = exceptionParams.getSize();
paramsArgs.push_back(&numExceptions);
paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
paramsArgs.push_back(&exceptionParams.getDevicePointer());
paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
}
cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
if (exclusionParams.isInitialized()) {
int numExclusions = exclusionParams.getSize();
vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
&numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
}
if (usePmeStream) {
hipEventRecord(paramsSyncEvent, cu.getCurrentStream());
hipStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
}
if (hasOffsets)
energy = 0.0; // The Ewald self energy was computed in the kernel.
recomputeParams = false;
}
// Do reciprocal space calculations.
if (cosSinSums.isInitialized() && includeReciprocal) {
void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums.getSize());
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeStream)
cu.setCurrentStream(pmeStream);
// Invert the periodic box vectors.
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
double scale = 1.0/determinant;
double4 recipBoxVectors[3];
recipBoxVectors[0] = make_double4(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0, 0);
recipBoxVectors[1] = make_double4(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0, 0);
recipBoxVectors[2] = make_double4((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale, 0);
float4 recipBoxVectorsFloat[3];
void* recipBoxVectorPointer[3];
if (cu.getUseDoublePrecision()) {
recipBoxVectorPointer[0] = &recipBoxVectors[0];
recipBoxVectorPointer[1] = &recipBoxVectors[1];
recipBoxVectorPointer[2] = &recipBoxVectors[2];
}
else {
recipBoxVectorsFloat[0] = make_float4((float) recipBoxVectors[0].x, 0, 0, 0);
recipBoxVectorsFloat[1] = make_float4((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0, 0);
recipBoxVectorsFloat[2] = make_float4((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z, 0);
recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
}
// Execute the reciprocal space kernels.
if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernelFlat(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
fft->execFFT(true);
if (includeEnergy) {
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
&pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
fft->execFFT(false);
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (doLJPME && hasLJ) {
if (!hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernelFlat(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
cu.clearBuffer(pmeEnergyBuffer);
}
cu.clearBuffer(pmeGrid2);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
dispersionFft->execFFT(true);
if (includeEnergy) {
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
&pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
dispersionFft->execFFT(false);
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (usePmeStream) {
hipEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
}
if (dispersionCoefficient != 0.0 && includeDirect) {
double4 boxSize = cu.getPeriodicBoxSize();
energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
}
return energy;
}
void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
// Make sure the new parameters are acceptable.
ContextSelector selector(cu);
if (force.getNumParticles() != cu.getNumAtoms())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
if (!hasCoulomb || !hasLJ) {
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
if (!hasCoulomb && charge != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
if (!hasLJ && epsilon != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
}
}
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end())
exceptions.push_back(i);
}
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions != exceptionAtoms.size())
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
// Record the per-particle parameters.
vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
const vector<int>& order = cu.getAtomIndex();
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
}
baseParticleParams.upload(baseParticleParamVec);
// Record the exceptions.
if (numExceptions > 0) {
vector<float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], particle1, particle2, chargeProd, sigma, epsilon);
if (make_pair(particle1, particle2) != exceptionAtoms[i])
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
}
baseExceptionParams.upload(baseExceptionParamsVec);
}
// Compute other values.
ewaldSelfEnergy = 0.0;
if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
if (cu.getContextIndex() == 0) {
for (int i = 0; i < force.getNumParticles(); i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
if (doLJPME)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
}
}
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
cu.invalidateMolecules();
recomputeParams = true;
}
void HipCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != PME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
else {
alpha = this->alpha;
nx = gridSizeX;
ny = gridSizeY;
nz = gridSizeZ;
}
}
void HipCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (!doLJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/OpenMMException.h"
#include "HipNonbondedUtilities.h"
#include "HipArray.h"
#include "HipContext.h"
#include "HipKernelSources.h"
#include "HipExpressionUtilities.h"
#include "HipSort.h"
#include <algorithm>
#include <map>
#include <set>
#include <utility>
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<errorMessage<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
public:
BlockSortTrait() {}
int getDataSize() const {return sizeof(int);}
int getKeySize() const {return sizeof(int);}
const char* getDataType() const {return "unsigned int";}
const char* getKeyType() const {return "unsigned int";}
const char* getMinKey() const {return "0";}
const char* getMaxKey() const {return "0xFFFFFFFFu";}
const char* getMaxValue() const {return "0xFFFFFFFFu";}
const char* getSortKey() const {return "value";}
};
HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, context.getEventFlags()));
CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(unsigned int), context.getHostMallocFlags()));
numForceThreadBlocks = 5*4*context.getMultiprocessors();
forceThreadBlockSize = 64;
findInteractingBlocksThreadBlockSize = context.getSIMDWidth();
// When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to
// accelerate the process. This makes building the neighbor list faster, but it prevents
// us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
// list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 90000);
setKernelSource(HipKernelSources::nonbonded);
}
HipNonbondedUtilities::~HipNonbondedUtilities() {
if (blockSorter != NULL)
delete blockSorter;
if (pinnedCountBuffer != NULL)
hipHostFree(pinnedCountBuffer);
hipEventDestroy(downloadCountEvent);
}
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) {
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, false);
}
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff");
if (usesPeriodic != usePeriodic)
throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
if (usesCutoff && groupCutoff.find(forceGroup) != groupCutoff.end() && groupCutoff[forceGroup] != cutoffDistance)
throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
}
if (usesExclusions)
requestExclusions(exclusionList);
useCutoff = usesCutoff;
usePeriodic = usesPeriodic;
useNeighborList |= (usesNeighborList && useCutoff);
groupCutoff[forceGroup] = cutoffDistance;
groupFlags |= 1<<forceGroup;
canUsePairList &= supportsPairList;
if (kernel.size() > 0) {
if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
groupKernelSource[forceGroup] = "";
map<string, string> replacements;
replacements["CUTOFF"] = "CUTOFF_"+context.intToString(forceGroup);
replacements["CUTOFF_SQUARED"] = "CUTOFF_"+context.intToString(forceGroup)+"_SQUARED";
groupKernelSource[forceGroup] += context.replaceStrings(kernel, replacements)+"\n";
}
}
void HipNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant()));
}
void HipNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
parameters.push_back(parameter);
}
void HipNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant()));
}
void HipNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
arguments.push_back(parameter);
}
string HipNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
// See if the parameter has already been added.
int index;
for (index = 0; index < energyParameterDerivatives.size(); index++)
if (param == energyParameterDerivatives[index])
break;
if (index == energyParameterDerivatives.size())
energyParameterDerivatives.push_back(param);
context.addEnergyParameterDerivative(param);
return string("energyParamDeriv")+context.intToString(index);
}
void HipNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
if (anyExclusions) {
bool sameExclusions = (exclusionList.size() == atomExclusions.size());
for (int i = 0; i < (int) exclusionList.size() && sameExclusions; i++) {
if (exclusionList[i].size() != atomExclusions[i].size())
sameExclusions = false;
set<int> expectedExclusions;
expectedExclusions.insert(atomExclusions[i].begin(), atomExclusions[i].end());
for (int j = 0; j < (int) exclusionList[i].size(); j++)
if (expectedExclusions.find(exclusionList[i][j]) == expectedExclusions.end())
sameExclusions = false;
}
if (!sameExclusions)
throw OpenMMException("All Forces must have identical exceptions");
}
else {
atomExclusions = exclusionList;
anyExclusions = true;
}
}
static bool compareInt2(int2 a, int2 b) {
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
static bool compareInt2LargeSIMD(int2 a, int2 b) {
// This version is used on devices with SIMD width greater than tile size. It puts diagonal tiles before off-diagonal
// ones to reduce thread divergence.
if (a.x == a.y) {
if (b.x == b.y)
return (a.x < b.x);
return true;
}
if (b.x == b.y)
return false;
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
void HipNonbondedUtilities::initialize(const System& system) {
string errorMessage = "Error initializing nonbonded utilities";
if (atomExclusions.size() == 0) {
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions.resize(context.getNumAtoms());
for (int i = 0; i < (int) atomExclusions.size(); i++)
atomExclusions[i].push_back(i);
}
// Create the list of tiles.
numAtoms = context.getNumAtoms();
int numAtomBlocks = context.getNumAtomBlocks();
int numContexts = context.getPlatformData().contexts.size();
setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
// Build a list of tiles that contain exclusions.
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/HipContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/HipContext::TileSize;
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
vector<int2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useNeighborList ? compareInt2 : compareInt2LargeSIMD);
exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
int2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
exclusionBlocksForBlock[iter->first].push_back(iter->second);
if (iter->first != iter->second)
exclusionBlocksForBlock[iter->second].push_back(iter->first);
}
vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<unsigned int> exclusionIndicesVec;
for (int i = 0; i < numAtomBlocks; i++) {
exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
}
maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
exclusionIndices.initialize<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices.initialize<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices.upload(exclusionIndicesVec);
exclusionRowIndices.upload(exclusionRowIndicesVec);
// Record the exclusion data.
exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions");
tileflags allFlags = (tileflags) -1;
vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/HipContext::TileSize;
int offset1 = atom1-x*HipContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/HipContext::TileSize;
int offset2 = atom2-y*HipContext::TileSize;
if (x > y) {
int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize;
exclusionVec[index+offset1] &= allFlags-(1<<offset2);
}
else {
int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize;
exclusionVec[index+offset2] &= allFlags-(1<<offset1);
}
}
}
atomExclusions.clear(); // We won't use this again, so free the memory it used
exclusions.upload(exclusionVec);
// Create data structures for the neighbor list.
if (useCutoff) {
// Select a size for the arrays that hold the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
maxTiles = 20*numAtomBlocks;
if (maxTiles > numTiles)
maxTiles = numTiles;
if (maxTiles < 1)
maxTiles = 1;
maxSinglePairs = 5*numAtoms;
// HIP-TODO: This may require tuning
numTilesInBatch = numAtomBlocks < 2000 ? 4 : 1;
interactingTiles.initialize<int>(context, maxTiles, "interactingTiles");
interactingAtoms.initialize<int>(context, HipContext::TileSize*maxTiles, "interactingAtoms");
interactionCount.initialize<unsigned int>(context, 2, "interactionCount");
singlePairs.initialize<int2>(context, maxSinglePairs, "singlePairs");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks.initialize<unsigned int>(context, numAtomBlocks, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange");
largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
if (context.getUseDoublePrecision()) {
blockSizeRange.upload(vector<double>{1e38, 0});
} else {
blockSizeRange.upload(vector<float>{1e38, 0});
}
}
// Record arguments for kernels.
forceArgs.push_back(&context.getForce().getDevicePointer());
forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&context.getPosq().getDevicePointer());
forceArgs.push_back(&exclusions.getDevicePointer());
forceArgs.push_back(&exclusionTiles.getDevicePointer());
forceArgs.push_back(&startTileIndex);
forceArgs.push_back(&numTiles);
if (useCutoff) {
forceArgs.push_back(&interactingTiles.getDevicePointer());
forceArgs.push_back(&interactionCount.getDevicePointer());
forceArgs.push_back(context.getPeriodicBoxSizePointer());
forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
forceArgs.push_back(context.getPeriodicBoxVecXPointer());
forceArgs.push_back(context.getPeriodicBoxVecYPointer());
forceArgs.push_back(context.getPeriodicBoxVecZPointer());
forceArgs.push_back(&maxTiles);
forceArgs.push_back(&blockCenter.getDevicePointer());
forceArgs.push_back(&blockBoundingBox.getDevicePointer());
forceArgs.push_back(&interactingAtoms.getDevicePointer());
forceArgs.push_back(&maxSinglePairs);
forceArgs.push_back(&singlePairs.getDevicePointer());
}
for (int i = 0; i < (int) parameters.size(); i++)
forceArgs.push_back(&parameters[i].getMemory());
for (ParameterInfo& arg : arguments)
forceArgs.push_back(&arg.getMemory());
if (energyParameterDerivatives.size() > 0)
forceArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
if (useCutoff) {
findBlockBoundsArgs.push_back(&numAtoms);
findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecXPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer());
computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer());
computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer());
computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
if (useLargeBlocks) {
sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer());
sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer());
}
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList);
sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
findInteractingBlocksArgs.push_back(&interactionCount.getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles.getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms.getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairs.getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&maxSinglePairs);
findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numBlocks);
findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
if (useLargeBlocks) {
findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
}
findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
copyInteractionCountsArgs.push_back(&interactionCount.getDevicePointer());
copyInteractionCountsArgs.push_back(&pinnedCountBuffer);
}
}
double HipNonbondedUtilities::getMaxCutoffDistance() {
double cutoff = 0.0;
for (map<int, double>::const_iterator iter = groupCutoff.begin(); iter != groupCutoff.end(); ++iter)
cutoff = max(cutoff, iter->second);
return cutoff;
}
double HipNonbondedUtilities::padCutoff(double cutoff) {
double padding = (usePadding ? 0.08*cutoff : 0.0);
return cutoff+padding;
}
void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
if ((forceGroups&groupFlags) == 0)
return;
if (groupKernels.find(forceGroups) == groupKernels.end())
createKernelsForGroups(forceGroups);
KernelSet& kernels = groupKernels[forceGroups];
if (useCutoff && usePeriodic) {
double4 box = context.getPeriodicBoxSize();
double minAllowedSize = 1.999999*kernels.cutoffDistance;
if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
}
if (!useNeighborList)
return;
if (numTiles == 0)
return;
// Compute the neighbor list.
if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true;
context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth());
context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks());
blockSorter->sort(sortedBlocks);
context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64);
context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance;
context.executeKernelFlat(kernels.copyInteractionCountsKernel, &copyInteractionCountsArgs[0], 1, 1);
hipEventRecord(downloadCountEvent, context.getCurrentStream());
}
void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
if ((forceGroups&groupFlags) == 0)
return;
KernelSet& kernels = groupKernels[forceGroups];
if (kernels.hasForces) {
hipFunction_t& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
if (kernel == NULL)
kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
if (useNeighborList && numTiles > 0) {
hipEventSynchronize(downloadCountEvent);
updateNeighborListSize();
}
}
bool HipNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff)
return false;
if (context.getStepsSinceReorder() == 0 || tilesAfterReorder == 0)
tilesAfterReorder = pinnedCountBuffer[0];
else if (context.getStepsSinceReorder() > 25 && pinnedCountBuffer[0] > 1.1*tilesAfterReorder)
context.forceReorder();
if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
return false;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
if (pinnedCountBuffer[0] > maxTiles) {
maxTiles = (unsigned int) (1.2*pinnedCountBuffer[0]);
unsigned int numBlocks = context.getNumAtomBlocks();
int totalTiles = numBlocks*(numBlocks+1)/2;
if (maxTiles > totalTiles)
maxTiles = totalTiles;
interactingTiles.resize(maxTiles);
interactingAtoms.resize(HipContext::TileSize*(size_t) maxTiles);
if (forceArgs.size() > 0)
forceArgs[7] = &interactingTiles.getDevicePointer();
findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();
if (forceArgs.size() > 0)
forceArgs[17] = &interactingAtoms.getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
}
if (pinnedCountBuffer[1] > maxSinglePairs) {
maxSinglePairs = (unsigned int) (1.2*pinnedCountBuffer[1]);
singlePairs.resize(maxSinglePairs);
if (forceArgs.size() > 0)
forceArgs[19] = &singlePairs.getDevicePointer();
findInteractingBlocksArgs[8] = &singlePairs.getDevicePointer();
}
forceRebuildNeighborList = true;
context.setForcesValid(false);
return true;
}
void HipNonbondedUtilities::setUsePadding(bool padding) {
usePadding = padding;
}
void HipNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);
numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
forceRebuildNeighborList = true;
}
void HipNonbondedUtilities::createKernelsForGroups(int groups) {
KernelSet kernels;
double cutoff = 0.0;
string source;
for (int i = 0; i < 32; i++) {
if ((groups&(1<<i)) != 0) {
cutoff = max(cutoff, groupCutoff[i]);
source += groupKernelSource[i];
}
}
kernels.hasForces = (source.size() > 0);
kernels.cutoffDistance = cutoff;
kernels.source = source;
kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
if (useCutoff) {
double paddedCutoff = padCutoff(cutoff);
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
if (context.getBoxIsTriclinic())
defines["TRICLINIC"] = "1";
if (useLargeBlocks)
defines["USE_LARGE_BLOCKS"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
int maxBits = 0;
if (canUsePairList) {
if (context.getUseDoublePrecision()) {
maxBits = 4;
}
else {
if (context.getSIMDWidth() > 32) {
// CDNA
if (context.getNumAtoms() < 100000)
maxBits = 4;
else // Large systems
maxBits = 0;
}
else {
// RDNA
if (context.getNumAtoms() < 100000)
maxBits = 4;
else if (context.getNumAtoms() < 500000)
maxBits = 2;
else // Very large systems
maxBits = 0;
}
}
}
defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits);
defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch);
defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize);
int binShift = 1;
while (1<<binShift <= context.getNumAtomBlocks())
binShift++;
defines["BIN_SHIFT"] = context.intToString(binShift);
defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.computeSortKeysKernel = context.getKernel(interactingBlocksProgram, "computeSortKeys");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
}
groupKernels[groups] = kernels;
}
hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups, bool includeForces, bool includeEnergy) {
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
const string suffixes[] = {"x", "y", "z", "w"};
stringstream args;
for (const ParameterInfo& param : params) {
args << ", ";
if (param.isConstant())
args << "const ";
args << param.getType();
args << "* __restrict__ global_";
args << param.getName();
}
for (const ParameterInfo& arg : arguments) {
args << ", ";
if (arg.isConstant())
args << "const ";
args << arg.getType();
args << "* __restrict__ ";
args << arg.getName();
}
if (energyParameterDerivatives.size() > 0)
args << ", mixed* __restrict__ energyParamDerivs";
replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream load1;
for (const ParameterInfo& param : params) {
load1 << param.getType();
load1 << " ";
load1 << param.getName();
load1 << "1 = global_";
load1 << param.getName();
load1 << "[atom1];\n";
}
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
// Part 1. Defines for on diagonal exclusion tiles
stringstream broadcastWarpData;
broadcastWarpData << "posq2.x = SHFL(shflPosq.x, j);\n";
broadcastWarpData << "posq2.y = SHFL(shflPosq.y, j);\n";
broadcastWarpData << "posq2.z = SHFL(shflPosq.z, j);\n";
broadcastWarpData << "posq2.w = SHFL(shflPosq.w, j);\n";
for (const ParameterInfo& param : params) {
broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
for (int j = 0; j < param.getNumComponents(); j++) {
if (param.getNumComponents() == 1)
broadcastWarpData << "shfl" << param.getName() << "=SHFL(" << param.getName() <<"1,j);\n";
else
broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=SHFL(" << param.getName()+"1."+suffixes[j] <<",j);\n";
}
}
replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream declareLocal2;
for (const ParameterInfo& param : params)
declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
stringstream loadLocal2;
for (const ParameterInfo& param : params)
loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load2j;
for (const ParameterInfo& param : params)
load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream clearLocal;
for (const ParameterInfo& param : params) {
clearLocal<<"shfl";
clearLocal<<param.getName()<<" = ";
if (param.getNumComponents() == 1)
clearLocal<<"0;\n";
else
clearLocal<<"make_"<<param.getType()<<"(0);\n";
}
replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
stringstream initDerivs;
for (int i = 0; i < energyParameterDerivatives.size(); i++)
initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
replacements["INIT_DERIVATIVES"] = initDerivs.str();
stringstream saveDerivs;
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size();
for (int i = 0; i < energyParameterDerivatives.size(); i++)
for (int index = 0; index < numDerivs; index++)
if (allParamDerivNames[index] == energyParameterDerivatives[i])
saveDerivs<<"energyParamDerivs[GLOBAL_ID*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
stringstream shuffleWarpData;
shuffleWarpData << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n";
shuffleWarpData << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n";
for (const ParameterInfo& param : params) {
shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft<TILE_SIZE>(shfl"<<param.getName()<<");\n";
}
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
map<string, string> defines;
if (useCutoff)
defines["USE_CUTOFF"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
if (useExclusions)
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
if (useNeighborList)
defines["USE_NEIGHBOR_LIST"] = "1";
defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
if (includeForces)
defines["INCLUDE_FORCES"] = "1";
if (includeEnergy)
defines["INCLUDE_ENERGY"] = "1";
defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
double maxCutoff = 0.0;
for (int i = 0; i < 32; i++) {
if ((groups&(1<<i)) != 0) {
double cutoff = groupCutoff[i];
maxCutoff = max(maxCutoff, cutoff);
defines["CUTOFF_"+context.intToString(i)+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["CUTOFF_"+context.intToString(i)] = context.doubleToString(cutoff);
}
}
defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
defines["MAX_CUTOFF_SQUARED"] = context.doubleToString(maxCutoff*maxCutoff);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
int numExclusionTiles = exclusionTiles.getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
hipFunction_t kernel = context.getKernel(program, "computeNonbonded");
return kernel;
}
void HipNonbondedUtilities::setKernelSource(const string& source) {
kernelSource = source;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2021 Stanford University and the Authors. *
* Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipParallelKernels.h"
#include "HipKernelSources.h"
#include "openmm/common/ContextSelector.h"
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
/**
* Get the current clock time, measured in microseconds.
*/
#ifdef _MSC_VER
#include <Windows.h>
static long long getTime() {
FILETIME ft;
GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
ULARGE_INTEGER result;
result.LowPart = ft.dwLowDateTime;
result.HighPart = ft.dwHighDateTime;
return result.QuadPart/10;
}
#else
#include <sys/time.h>
static long long getTime() {
struct timeval tod;
gettimeofday(&tod, 0);
return 1000000*tod.tv_sec+tod.tv_usec;
}
#endif
class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
}
void execute() {
// Copy coordinates over to this device and execute the kernel.
ContextSelector selector(cu);
if (cu.getContextIndex() > 0) {
hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (!cu.getPlatformData().peerAccessSupported)
cu.getPosq().upload(pinnedMemory, false);
}
kernel.beginComputation(context, includeForce, includeEnergy, groups);
}
private:
ContextImpl& context;
HipContext& cu;
HipCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy;
int groups;
void* pinnedMemory;
hipEvent_t event;
};
class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces,
bool& valid, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid),
stream(stream), event(event), localEvent(localEvent) {
}
void execute() {
// Execute the kernel, then download forces.
ContextSelector selector(cu);
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
if (cu.getComputeForceCount() < 200) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT(hipStreamSynchronize(cu.getCurrentStream()), "Error synchronizing HIP context");
completionTime = getTime();
}
if (includeForce) {
if (cu.getContextIndex() > 0) {
hipEventRecord(localEvent, cu.getCurrentStream());
hipStreamWaitEvent(stream, localEvent, 0);
int numAtoms = cu.getPaddedNumAtoms();
if (cu.getPlatformData().peerAccessSupported) {
int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes;
CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset,
cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces");
hipEventRecord(event, stream);
}
else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
}
}
}
private:
ContextImpl& context;
HipContext& cu;
HipCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy;
int groups;
double& energy;
long long& completionTime;
long long* pinnedMemory;
HipArray& contextForces;
bool& valid;
hipStream_t stream;
hipEvent_t event;
hipEvent_t localEvent;
};
HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
ContextSelector selector(*data.contexts[0]);
if (pinnedPositionBuffer != NULL)
hipHostFree(pinnedPositionBuffer);
if (pinnedForceBuffer != NULL)
hipHostFree(pinnedForceBuffer);
hipEventDestroy(event);
for (int i = 0; i < peerCopyEvent.size(); i++)
hipEventDestroy(peerCopyEvent[i]);
for (int i = 0; i < peerCopyEventLocal.size(); i++)
hipEventDestroy(peerCopyEventLocal[i]);
for (int i = 0; i < peerCopyStream.size(); i++)
hipStreamDestroy(peerCopyStream[i]);
}
void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
HipContext& cu = *data.contexts[0];
ContextSelector selector(cu);
hipModule_t module = cu.createModule(HipKernelSources::parallel);
sumKernel = cu.getKernel(module, "sumForces");
int numContexts = data.contexts.size();
for (int i = 0; i < numContexts; i++)
getKernel(i).initialize(system);
for (int i = 0; i < numContexts; i++)
contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(hipEventCreateWithFlags(&event, cu.getEventFlags()), "Error creating event");
peerCopyEvent.resize(numContexts);
peerCopyEventLocal.resize(numContexts);
peerCopyStream.resize(numContexts);
for (int i = 0; i < numContexts; i++) {
HipContext& cuLocal = *data.contexts[i];
ContextSelector selectorLocal(cuLocal);
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
}
}
void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
HipContext& cu = *data.contexts[0];
ContextSelector selector(cu);
if (!contextForces.isInitialized()) {
contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
if (!cu.getPlatformData().peerAccessSupported) {
CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
}
}
// Copy coordinates over to each device and execute the kernel.
if (!cu.getPlatformData().peerAccessSupported) {
cu.getPosq().download(pinnedPositionBuffer, false);
hipEventRecord(event, cu.getCurrentStream());
}
else {
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
hipEventRecord(event, cu.getCurrentStream());
for (int i = 1; i < (int) data.contexts.size(); i++) {
hipStreamWaitEvent(peerCopyStream[i], event, 0);
CHECK_RESULT(hipMemcpyAsync(
data.contexts[i]->getPosq().getDevicePointer(),
cu.getPosq().getDevicePointer(), numBytes,
hipMemcpyDeviceToDevice, peerCopyStream[i]), "Error copying positions");
hipEventRecord(peerCopyEvent[i], peerCopyStream[i]);
}
}
for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0;
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
hipEvent_t waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent));
}
data.syncContexts();
}
double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
pinnedForceBuffer, contextForces, valid, peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
}
data.syncContexts();
HipContext& cu = *data.contexts[0];
ContextSelector selector(cu);
if (cu.getPlatformData().peerAccessSupported)
for (int i = 1; i < data.contexts.size(); i++)
hipStreamWaitEvent(cu.getCurrentStream(), peerCopyEvent[i], 0);
double energy = 0.0;
for (int i = 0; i < (int) data.contextEnergy.size(); i++)
energy += data.contextEnergy[i];
if (includeForce && valid) {
// Sum the forces from all devices.
if (!cu.getPlatformData().peerAccessSupported)
contextForces.upload(pinnedForceBuffer, false);
int bufferSize = 3*cu.getPaddedNumAtoms();
int numBuffers = data.contexts.size()-1;
void* args[] = {&cu.getForce().getDevicePointer(), &contextForces.getDevicePointer(), &bufferSize, &numBuffers};
cu.executeKernel(sumKernel, args, bufferSize);
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0;
const double eps = 0.001;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (contextNonbondedFractions[lastIndex] < eps || completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
}
double fractionToTransfer = min(cu.getComputeForceCount() < 100 ? 0.01 : 0.001, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
double endFraction = startFraction+contextNonbondedFractions[i];
if (i == contextNonbondedFractions.size()-1)
endFraction = 1.0; // Avoid roundoff error
data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
startFraction = endFraction;
}
}
}
return energy;
}
class HipParallelCalcHarmonicBondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcHarmonicBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcHarmonicBondForceKernel::HipParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcHarmonicBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomBondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomBondForceKernel::HipParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcHarmonicAngleForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcHarmonicAngleForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcHarmonicAngleForceKernel::HipParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcHarmonicAngleForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomAngleForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomAngleForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomAngleForceKernel::HipParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomAngleForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcPeriodicTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcPeriodicTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcPeriodicTorsionForceKernel::HipParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcPeriodicTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcRBTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcRBTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcRBTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcRBTorsionForceKernel::HipParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcRBTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCMAPTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCMAPTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCMAPTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCMAPTorsionForceKernel::HipParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCMAPTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomTorsionForceKernel::HipParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcNonbondedForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, HipCalcNonbondedForceKernel& kernel, bool includeForce,
bool includeEnergy, bool includeDirect, bool includeReciprocal, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), includeDirect(includeDirect), includeReciprocal(includeReciprocal), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
}
private:
ContextImpl& context;
HipCalcNonbondedForceKernel& kernel;
bool includeForce, includeEnergy, includeDirect, includeReciprocal;
double& energy;
};
HipParallelCalcNonbondedForceKernel::HipParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcNonbondedForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new HipCalcNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
void HipParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
}
void HipParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
}
class HipParallelCalcCustomNonbondedForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomNonbondedForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomNonbondedForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomNonbondedForceKernel::HipParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomNonbondedForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomExternalForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomExternalForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomExternalForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomExternalForceKernel::HipParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomExternalForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomHbondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomHbondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomHbondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomHbondForceKernel::HipParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomHbondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomCompoundBondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomCompoundBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomCompoundBondForceKernel::HipParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomCompoundBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipParameterSet.h"
using namespace OpenMM;
using namespace std;
HipParameterSet::HipParameterSet(HipContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
ComputeParameterSet(context, numParameters, numObjects, name, bufferPerParameter, useDoublePrecision) {
for (auto& info : getParameterInfos())
buffers.push_back(HipNonbondedUtilities::ParameterInfo(info.getName(), info.getComponentType(), info.getNumComponents(), info.getSize(), context.unwrap(info.getArray()).getDevicePointer()));
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2024 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipContext.h"
#include "HipExpressionUtilities.h"
#include "HipPlatform.h"
#include "HipKernelFactory.h"
#include "HipKernels.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/hardware.h"
#include <algorithm>
#include <cctype>
#include <sstream>
#include <cstdio>
#ifdef _MSC_VER
#include <Windows.h>
#endif
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
#ifdef OPENMM_COMMON_BUILDING_STATIC_LIBRARY
extern "C" void registerHipPlatform() {
Platform::registerPlatform(new HipPlatform());
}
#else
extern "C" OPENMM_EXPORT_COMMON void registerPlatforms() {
Platform::registerPlatform(new HipPlatform());
}
#endif
HipPlatform::HipPlatform() {
deprecatedPropertyReplacements["HipDeviceIndex"] = HipDeviceIndex();
deprecatedPropertyReplacements["HipDeviceName"] = HipDeviceName();
deprecatedPropertyReplacements["HipUseBlockingSync"] = HipUseBlockingSync();
deprecatedPropertyReplacements["HipPrecision"] = HipPrecision();
deprecatedPropertyReplacements["HipUseCpuPme"] = HipUseCpuPme();
deprecatedPropertyReplacements["HipTempDirectory"] = HipTempDirectory();
deprecatedPropertyReplacements["HipDisablePmeStream"] = HipDisablePmeStream();
deprecatedPropertyReplacements["HipDeterministicForces"] = HipDeterministicForces();
HipKernelFactory* factory = new HipKernelFactory();
registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
registerKernelFactory(UpdateStateDataKernel::Name(), factory);
registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
registerKernelFactory(VirtualSitesKernel::Name(), factory);
registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCPPForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
registerKernelFactory(CalcRMSDForceKernel::Name(), factory);
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
registerKernelFactory(CalcATMForceKernel::Name(), factory);
platformProperties.push_back(HipDeviceIndex());
platformProperties.push_back(HipDeviceName());
platformProperties.push_back(HipUseBlockingSync());
platformProperties.push_back(HipPrecision());
platformProperties.push_back(HipUseCpuPme());
platformProperties.push_back(HipTempDirectory());
platformProperties.push_back(HipDisablePmeStream());
platformProperties.push_back(HipDeterministicForces());
setPropertyDefaultValue(HipDeviceIndex(), "");
setPropertyDefaultValue(HipDeviceName(), "");
setPropertyDefaultValue(HipUseBlockingSync(), "true");
setPropertyDefaultValue(HipPrecision(), "single");
setPropertyDefaultValue(HipUseCpuPme(), "false");
setPropertyDefaultValue(HipDisablePmeStream(), "false");
setPropertyDefaultValue(HipDeterministicForces(), "false");
#ifdef _MSC_VER
setPropertyDefaultValue(HipTempDirectory(), string(getenv("TEMP")));
#else
char* tmpdir = getenv("TMPDIR");
string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
setPropertyDefaultValue(HipTempDirectory(), tmp);
#endif
}
double HipPlatform::getSpeed() const {
// Reduce the speed of the HIP platform if there are no HIP devices in the system,
// so the OpenCL plaform can be selected as default
int numDevices;
return hipGetDeviceCount(&numDevices) != hipErrorNoDevice ? 100 : 40;
}
bool HipPlatform::supportsDoublePrecision() const {
return true;
}
const string& HipPlatform::getPropertyValue(const Context& context, const string& property) const {
const ContextImpl& impl = getContextImpl(context);
const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
string propertyName = property;
if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
propertyName = deprecatedPropertyReplacements.find(property)->second;
map<string, string>::const_iterator value = data->propertyValues.find(propertyName);
if (value != data->propertyValues.end())
return value->second;
return Platform::getPropertyValue(context, property);
}
void HipPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
}
void HipPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
const string& devicePropValue = (properties.find(HipDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(HipDeviceIndex()) : properties.find(HipDeviceIndex())->second);
string blockingPropValue = (properties.find(HipUseBlockingSync()) == properties.end() ?
getPropertyDefaultValue(HipUseBlockingSync()) : properties.find(HipUseBlockingSync())->second);
string precisionPropValue = (properties.find(HipPrecision()) == properties.end() ?
getPropertyDefaultValue(HipPrecision()) : properties.find(HipPrecision())->second);
string cpuPmePropValue = (properties.find(HipUseCpuPme()) == properties.end() ?
getPropertyDefaultValue(HipUseCpuPme()) : properties.find(HipUseCpuPme())->second);
const string& tempPropValue = (properties.find(HipTempDirectory()) == properties.end() ?
getPropertyDefaultValue(HipTempDirectory()) : properties.find(HipTempDirectory())->second);
string pmeStreamPropValue = (properties.find(HipDisablePmeStream()) == properties.end() ?
getPropertyDefaultValue(HipDisablePmeStream()) : properties.find(HipDisablePmeStream())->second);
string deterministicForcesValue = (properties.find(HipDeterministicForces()) == properties.end() ?
getPropertyDefaultValue(HipDeterministicForces()) : properties.find(HipDeterministicForces())->second);
transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
transform(pmeStreamPropValue.begin(), pmeStreamPropValue.end(), pmeStreamPropValue.begin(), ::tolower);
transform(deterministicForcesValue.begin(), deterministicForcesValue.end(), deterministicForcesValue.begin(), ::tolower);
vector<string> pmeKernelName;
pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
if (!supportsKernels(pmeKernelName))
cpuPmePropValue = "false";
int threads = getNumProcessors();
char* threadsEnv = getenv("OPENMM_CPU_THREADS");
if (threadsEnv != NULL)
stringstream(threadsEnv) >> threads;
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, tempPropValue,
pmeStreamPropValue, deterministicForcesValue, threads, NULL));
}
void HipPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
Platform& platform = originalContext.getPlatform();
string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), HipDeviceIndex());
string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), HipUseBlockingSync());
string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), HipPrecision());
string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), HipUseCpuPme());
string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), HipTempDirectory());
string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), HipDisablePmeStream());
string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), HipDeterministicForces());
int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, tempPropValue,
pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
}
void HipPlatform::contextDestroyed(ContextImpl& context) const {
PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
delete data;
}
HipPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& tempProperty, const string& pmeStreamProperty,
const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false),
threads(numThreads) {
bool blocking = (blockingProperty == "true");
vector<string> devices;
size_t searchPos = 0, nextPos;
while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
searchPos = nextPos+1;
}
devices.push_back(deviceIndexProperty.substr(searchPos));
PlatformData* originalData = NULL;
if (originalContext != NULL)
originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
try {
for (int i = 0; i < (int) devices.size(); i++) {
if (devices[i].length() > 0) {
int deviceIndex;
stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new HipContext(system, deviceIndex, blocking, precisionProperty, tempProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
}
}
if (contexts.size() == 0)
contexts.push_back(new HipContext(system, -1, blocking, precisionProperty, tempProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
}
catch (...) {
// If an exception was thrown, do our best to clean up memory.
for (int i = 0; i < (int) contexts.size(); i++)
delete contexts[i];
throw;
}
stringstream deviceIndex, deviceName;
for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0) {
deviceIndex << ',';
deviceName << ',';
}
deviceIndex << contexts[i]->getDeviceIndex();
char name[1000];
CHECK_RESULT(hipDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
deviceName << name;
}
useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
disablePmeStream = (pmeStreamProperty == "true");
deterministicForces = (deterministicForcesProperty == "true");
propertyValues[HipPlatform::HipDeviceIndex()] = deviceIndex.str();
propertyValues[HipPlatform::HipDeviceName()] = deviceName.str();
propertyValues[HipPlatform::HipUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[HipPlatform::HipPrecision()] = precisionProperty;
propertyValues[HipPlatform::HipUseCpuPme()] = useCpuPme ? "true" : "false";
propertyValues[HipPlatform::HipTempDirectory()] = tempProperty;
propertyValues[HipPlatform::HipDisablePmeStream()] = disablePmeStream ? "true" : "false";
propertyValues[HipPlatform::HipDeterministicForces()] = deterministicForces ? "true" : "false";
contextEnergy.resize(contexts.size());
// Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported = true;
for (int i = 1; i < contexts.size(); i++) {
int canAccess;
hipDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
if (!canAccess) {
peerAccessSupported = false;
break;
}
}
}
HipPlatform::PlatformData::~PlatformData() {
for (int i = 0; i < (int) contexts.size(); i++)
delete contexts[i];
}
void HipPlatform::PlatformData::initializeContexts(const System& system) {
if (hasInitializedContexts)
return;
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->initialize();
hasInitializedContexts = true;
}
void HipPlatform::PlatformData::syncContexts() {
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->getWorkThread().flush();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipProgram.h"
#include "HipKernel.h"
using namespace OpenMM;
using namespace std;
HipProgram::HipProgram(HipContext& context, hipModule_t module) : context(context), module(module) {
}
ComputeKernel HipProgram::createKernel(const string& name) {
hipFunction_t kernel = context.getKernel(module, name.c_str());
return shared_ptr<ComputeKernelImpl>(new HipKernel(context, kernel, name));
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipSort.h"
#include "HipKernelSources.h"
#include <algorithm>
#include <map>
using namespace OpenMM;
using namespace std;
HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels.
map<string, string> replacements;
replacements["DATA_TYPE"] = trait->getDataType();
replacements["KEY_TYPE"] = trait->getKeyType();
replacements["SORT_KEY"] = trait->getSortKey();
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["UNIFORM"] = (uniform ? "1" : "0");
hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
shortList2Kernel = context.getKernel(module, "sortShortList2");
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
sortBucketsKernel = context.getKernel(module, "sortBuckets");
// Work out the work group sizes for various kernels.
int maxSharedMem;
hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
isShortList = (length <= maxShortList);
sortKernelSize = 256;
rangeKernelSize = 256;
if (rangeKernelSize > length)
rangeKernelSize = length;
rangeKernelBlocks = (length + rangeKernelSize - 1) / rangeKernelSize;
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = uniform ? sortKernelSize/2 : sortKernelSize/8;
unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1)
numBuckets = 1;
// computeBucketPositions is executed as a single work group so larger block size is faster.
positionsKernelSize = 1024;
if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets;
// Create workspace arrays.
if (!isShortList) {
counters.initialize<unsigned int>(context, 1, "counters");
unsigned int zero = 0;
counters.upload(&zero);
dataRange.initialize(context, 2*rangeKernelBlocks, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
}
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
HipSort::~HipSort() {
delete trait;
}
void HipSort::sort(HipArray& data) {
if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("HipSort called with different data size");
if (data.getSize() == 0)
return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel.
if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength, HipContext::ThreadBlockSize, HipContext::ThreadBlockSize*trait->getKeySize());
buckets.copyTo(data);
}
else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
}
else {
// Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelBlocks*rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
&bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
// Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
&bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &bucketOffset.getDevicePointer()};
context.executeKernelFlat(sortBucketsKernel, sortArgs, numBuckets*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment