#include "openmm/common/windowsExportCommon.h"
#include "HipArray.h"
#include "HipBondedUtilities.h"
#include "HipExpressionUtilities.h"
#include "HipIntegrationUtilities.h"
#include "HipNonbondedUtilities.h"
#include "HipPlatform.h"
#include "openmm/OpenMMException.h"
#include "openmm/common/ComputeContext.h"
#include "openmm/Kernel.h"
typedef unsigned int tileflags;
namespace OpenMM {
/**
* This class contains the information associated with a Context by the HIP Platform. Each HipContext is
* specific to a particular device, and manages data structures and kernels for that device. When running a simulation
* in parallel on multiple devices, there is a separate HipContext for each one. The list of all contexts is
* stored in the HipPlatform::PlatformData.
*
* In addition, a worker thread is created for each HipContext. This is used for parallel computations, so that
* blocking calls to one device will not block other devices. When only a single device is being used, the worker
* thread is not used and calculations are performed on the main application thread.
*/
class OPENMM_EXPORT_COMMON HipContext : public ComputeContext {
public:
class WorkTask;
class WorkThread;
class ReorderListener;
class ForcePreComputation;
class ForcePostComputation;
static const int ThreadBlockSize;
static const int TileSize;
HipContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
const std::string& tempDir, HipPlatform::PlatformData& platformData, HipContext* originalContext);
~HipContext();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void initialize();
/**
* Get whether the context associated with this object is valid.
*/
bool getContextIsValid() const {
return contextIsValid;
}
/**
* Set the device associated with this object to be the current device. If the context is not
* valid, this returns without doing anything.
*/
void setAsCurrent();
/**
* Push the device associated with this object to be the current device. If the context is not
* valid, this returns without doing anything.
*/
void pushAsCurrent();
/**
* Pop the device associated with this object off the stack of contexts. If the context is not
* valid, this returns without doing anything.
*/
void popAsCurrent();
/**
* Get the hipDevice_t associated with this object.
*/
hipDevice_t getDevice() {
return device;
}
/**
* Get the compute capability of the device associated with this object.
*/
double getComputeCapability() const {
return computeCapability;
}
/**
* Get the index of the hipDevice_t associated with this object.
*/
int getDeviceIndex() const {
return deviceIndex;
}
/**
* Get the PlatformData object this context is part of.
*/
HipPlatform::PlatformData& getPlatformData() {
return platformData;
}
/**
* Get the number of contexts being used for the current simulation.
* This is relevant when a simulation is parallelized across multiple devices. In that case,
* one HipContext is created for each device.
*/
int getNumContexts() const {
return platformData.contexts.size();
}
/**
* Get the index of this context in the list stored in the PlatformData.
*/
int getContextIndex() const {
return contextIndex;
}
/**
* Get a list of all contexts being used for the current simulation.
* This is relevant when a simulation is parallelized across multiple devices. In that case,
* one ComputeContext is created for each device.
*/
std::vector getAllContexts();
/**
* Get the ContextImpl is ComputeContext is associated with.
*/
ContextImpl* getContextImpl() {
return platformData.context;
}
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
*/
double& getEnergyWorkspace();
/**
* Create a new ComputeQueue for use with this context.
*/
ComputeQueue createQueue();
/**
* Get the stream currently being used for execution.
*/
hipStream_t getCurrentStream();
/**
* Construct an uninitialized array of the appropriate class for this platform. The returned
* value should be created on the heap with the "new" operator.
*/
HipArray* createArray();
/**
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/**
* Create an object for performing 3D FFTs. The caller is responsible for deleting
* the object when it is no longer needed.
*
* @param xsize the first dimension of the data sets on which FFTs will be performed
* @param ysize the second dimension of the data sets on which FFTs will be performed
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/**
* Get the smallest legal size for a dimension of the grid.
*/
int findLegalFFTDimension(int minimum);
/**
* Compile source code to create a ComputeProgram.
*
* @param source the source code of the program
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
*/
ComputeProgram compileProgram(const std::string source, const std::map& defines=std::map());
/**
* Convert an array to an HipArray. If the argument is already an HipArray, this simply casts it.
* If the argument is a ComputeArray that wraps a HipArray, this returns the wrapped array. For any
* other argument, this throws an exception.
*/
HipArray& unwrap(ArrayInterface& array) const;
/**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/
HipArray& getPosq() {
return posq;
}
/**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/
HipArray& getPosqCorrection() {
return posqCorrection;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
HipArray& getVelm() {
return velm;
}
/**
* Get the array which contains the force on each atom (represented as three long longs in 64 bit fixed point).
*/
HipArray& getForce() {
return force;
}
/**
* The HIP platform does not use floating point force buffers, so this throws an exception.
*/
ArrayInterface& getFloatForceBuffer() {
throw OpenMMException("HIP platform does not use floating point force buffers");
}
/**
* Get the array which contains a contribution to each force represented as 64 bit fixed point.
* This is a synonym for getForce(). It exists to satisfy the ComputeContext interface.
*/
HipArray& getLongForceBuffer() {
return force;
}
/**
* Not all HIP devices support 64 bit atomics, so this throws an exception.
* @return
*/
ArrayInterface& getForceBuffers() {
throw OpenMMException("HIP platform does not use floating point force buffers");
}
/**
* Get the array which contains the buffer in which energy is computed.
*/
HipArray& getEnergyBuffer() {
return energyBuffer;
}
/**
* Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed.
*/
HipArray& getEnergyParamDerivBuffer() {
return energyParamDerivBuffer;
}
/**
* Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
* This is guaranteed to be at least as large as any of the arrays returned by methods of this class.
*/
void* getPinnedBuffer() {
return pinnedBuffer;
}
/**
* Get a shared ThreadPool that code can use to parallelize operations.
*
* Because this object is freely available to all code, care is needed to avoid conflicts. Only use it
* from the main thread, and make sure all operations are complete before you invoke any other code that
* might make use of it
*/
ThreadPool& getThreadPool() {
return getPlatformData().threads;
}
/**
* Get the array which contains the index of each atom.
*/
HipArray& getAtomIndexArray() {
return atomIndexDevice;
}
/**
* Get a file name in tempDir unique for the current process and context.
*/
std::string getTempFileName() const;
/**
* Get src hash.
*/
std::string getHash(const std::string& src) const;
/**
* Get a filename in cacheDir based on src hash.
*/
std::string getCacheFileName(const std::string& src) const;
/**
* Create a HIP module from source code.
*
* @param source the source code of the module
*/
hipModule_t createModule(const std::string source);
/**
* Create a HIP module from source code.
*
* @param source the source code of the module
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
*/
hipModule_t createModule(const std::string source, const std::map& defines);
/**
* Get a kernel from a HIP module.
*
* @param module the module to get the kernel from
* @param name the name of the kernel to get
*/
hipFunction_t getKernel(hipModule_t& module, const std::string& name);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param threads the maximum number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/
void executeKernel(hipFunction_t kernel, void** arguments, int threads, int blockSize = -1, unsigned int sharedSize = 0);
/**
* Execute a kernel with full grid.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param threads the total number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/
void executeKernelFlat(hipFunction_t kernel, void** arguments, int threads, int blockSize = -1, unsigned int sharedSize = 0);
/**
* Compute the largest thread block size that can be used for a kernel that requires a particular amount of
* shared memory per thread.
*
* @param memory the number of bytes of shared memory per thread
*/
int computeThreadBlockSize(double memory) const;
/**
* Set all elements of an array to 0.
*/
void clearBuffer(ArrayInterface& array);
/**
* Set all elements of an array to 0.
*
* @param memory the memory to clear
* @param size the size of the buffer in bytes
*/
void clearBuffer(hipDeviceptr_t memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*/
void addAutoclearBuffer(ArrayInterface& array);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the size of the buffer in bytes
*/
void addAutoclearBuffer(hipDeviceptr_t memory, int size);
/**
* Clear all buffers that have been registered with addAutoclearBuffer().
*/
void clearAutoclearBuffers();
/**
* Sum the buffer containing energy.
*/
double reduceEnergy();
/**
* Get the number of blocks of TileSize atoms.
*/
int getNumAtomBlocks() const {
return numAtomBlocks;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*/
int getNumThreadBlocks() const {
return numThreadBlocks;
}
/**
* Get the maximum number of threads in a thread block supported by this device.
*/
int getMaxThreadBlockSize() const {
return 256;
}
/**
* Get whether the device being used is a CPU. In some cases, different algorithms
* may be more efficient on CPUs and GPUs.
*/
bool getIsCPU() const {
return false;
}
/**
* Get the SIMD width of the device being used.
*/
int getSIMDWidth() const {
return simdWidth;
}
/**
* Get the number of multiprocessors (compute units) of the device being used.
*/
int getMultiprocessors() const {
return multiprocessors;
}
/**
* Get whether the device being used supports 64 bit atomic operations on global memory.
*/
bool getSupports64BitGlobalAtomics() const {
return true;
}
/**
* Get whether the device being used supports 32 bit floating point atomic operations
* on global memory (fast hardware instructions, not a compare-and-swap loop implementation).
*/
bool getSupportsHardwareFloatGlobalAtomicAdd() const {
return supportsHardwareFloatGlobalAtomicAdd;
}
/**
* Get whether the device being used supports double precision math.
*/
bool getSupportsDoublePrecision() const {
return true;
}
/**
* Get whether double precision is being used.
*/
bool getUseDoublePrecision() const {
return useDoublePrecision;
}
/**
* Get whether mixed precision is being used.
*/
bool getUseMixedPrecision() const {
return useMixedPrecision;
}
/**
* Get whether the periodic box is triclinic.
*/
bool getBoxIsTriclinic() const {
return boxIsTriclinic;
}
/**
* Convert a HIP result code to the corresponding string description.
*/
static std::string getErrorString(hipError_t result);
/**
* Get the vectors defining the periodic box.
*/
void getPeriodicBoxVectors(Vec3& a, Vec3& b, Vec3& c) const {
a = Vec3(periodicBoxVecX.x, periodicBoxVecX.y, periodicBoxVecX.z);
b = Vec3(periodicBoxVecY.x, periodicBoxVecY.y, periodicBoxVecY.z);
c = Vec3(periodicBoxVecZ.x, periodicBoxVecZ.y, periodicBoxVecZ.z);
}
/**
* Set the vectors defining the periodic box.
*/
void setPeriodicBoxVectors(const Vec3& a, const Vec3& b, const Vec3& c) {
periodicBoxVecX = make_double4(a[0], a[1], a[2], 0.0);
periodicBoxVecY = make_double4(b[0], b[1], b[2], 0.0);
periodicBoxVecZ = make_double4(c[0], c[1], c[2], 0.0);
periodicBoxVecXFloat = make_float4((float) a[0], (float) a[1], (float) a[2], 0.0f);
periodicBoxVecYFloat = make_float4((float) b[0], (float) b[1], (float) b[2], 0.0f);
periodicBoxVecZFloat = make_float4((float) c[0], (float) c[1], (float) c[2], 0.0f);
periodicBoxSize = make_double4(a[0], b[1], c[2], 0.0);
invPeriodicBoxSize = make_double4(1.0/a[0], 1.0/b[1], 1.0/c[2], 0.0);
periodicBoxSizeFloat = make_float4((float) a[0], (float) b[1], (float) c[2], 0.0f);
invPeriodicBoxSizeFloat = make_float4(1.0f/(float) a[0], 1.0f/(float) b[1], 1.0f/(float) c[2], 0.0f);
}
/**
* Get the size of the periodic box.
*/
double4 getPeriodicBoxSize() const {
return periodicBoxSize;
}
/**
* Get the inverse of the size of the periodic box.
*/
double4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize;
}
/**
* Get a pointer to the size of the periodic box, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getPeriodicBoxSizePointer() {
return (useDoublePrecision ? reinterpret_cast(&periodicBoxSize) : reinterpret_cast(&periodicBoxSizeFloat));
}
/**
* Get a pointer to the inverse of the size of the periodic box, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getInvPeriodicBoxSizePointer() {
return (useDoublePrecision ? reinterpret_cast(&invPeriodicBoxSize) : reinterpret_cast(&invPeriodicBoxSizeFloat));
}
/**
* Get a pointer to the first periodic box vector, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getPeriodicBoxVecXPointer() {
return (useDoublePrecision ? reinterpret_cast(&periodicBoxVecX) : reinterpret_cast(&periodicBoxVecXFloat));
}
/**
* Get a pointer to the second periodic box vector, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getPeriodicBoxVecYPointer() {
return (useDoublePrecision ? reinterpret_cast(&periodicBoxVecY) : reinterpret_cast(&periodicBoxVecYFloat));
}
/**
* Get a pointer to the third periodic box vector, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getPeriodicBoxVecZPointer() {
return (useDoublePrecision ? reinterpret_cast(&periodicBoxVecZ) : reinterpret_cast(&periodicBoxVecZFloat));
}
/**
* Get the HipIntegrationUtilities for this context.
*/
HipIntegrationUtilities& getIntegrationUtilities() {
return *integration;
}
/**
* Get the HipExpressionUtilities for this context.
*/
HipExpressionUtilities& getExpressionUtilities() {
return *expression;
}
/**
* Get the HipBondedUtilities for this context.
*/
HipBondedUtilities& getBondedUtilities() {
return *bonded;
}
/**
* Get the HipNonbondedUtilities for this context.
*/
HipNonbondedUtilities& getNonbondedUtilities() {
return *nonbonded;
}
/**
* Create a new NonbondedUtilities for use with this context. This should be called
* only in unusual situations, when a Force needs its own NonbondedUtilities object
* separate from the standard one. The caller is responsible for deleting the object
* when it is no longer needed.
*/
HipNonbondedUtilities* createNonbondedUtilities() {
return new HipNonbondedUtilities(*this);
}
/**
* This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized.
*/
void initializeContexts();
/**
* Set the particle charges. These are packed into the fourth element of the posq array.
*/
void setCharges(const std::vector& charges);
/**
* Request to use the fourth element of the posq array for storing charges. Since only one force can
* do that, this returns true the first time it is called, and false on all subsequent calls.
*/
bool requestPosqCharges();
/**
* Get the names of all parameters with respect to which energy derivatives are computed.
*/
const std::vector& getEnergyParamDerivNames() const {
return energyParamDerivNames;
}
/**
* Get a workspace data structure used for accumulating the values of derivatives of the energy
* with respect to parameters.
*/
std::map& getEnergyParamDerivWorkspace() {
return energyParamDerivWorkspace;
}
/**
* Register that the derivative of potential energy with respect to a context parameter
* will need to be calculated. If this is called multiple times for a single parameter,
* it is only added to the list once.
*
* @param param the name of the parameter to add
*/
void addEnergyParameterDerivative(const std::string& param);
/**
* Wait until all work that has been queued (kernel executions, asynchronous data transfers, etc.)
* has been submitted to the device. This does not mean it has necessarily been completed.
* Calling this periodically may improve the responsiveness of the computer's GUI, but at the
* expense of reduced simulation performance.
*/
void flushQueue();
/**
* Get the flags that should be used when creating hipEvent_t objects.
*/
unsigned int getEventFlags();
/**
* Get the flags that should be used when allocating pinned host memory.
*/
unsigned int getHostMallocFlags();
private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std::vector getDevicePrecedence();
static bool hasInitializedHip;
double computeCapability;
HipPlatform::PlatformData& platformData;
int deviceIndex;
int contextIndex;
int numAtomBlocks;
int numThreadBlocks;
int simdWidth;
int multiprocessors;
int sharedMemPerBlock;
bool supportsHardwareFloatGlobalAtomicAdd;
unsigned int hostMallocFlags;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasAssignedPosqCharges;
bool isLinkedContext;
std::string tempDir, cacheDir, gpuArchitecture;
float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;
std::map compilationDefines;
std::vector loadedModules;
hipDevice_t device;
hipFunction_t clearBufferKernel;
hipFunction_t clearTwoBuffersKernel;
hipFunction_t clearThreeBuffersKernel;
hipFunction_t clearFourBuffersKernel;
hipFunction_t clearFiveBuffersKernel;
hipFunction_t clearSixBuffersKernel;
hipFunction_t reduceEnergyKernel;
hipFunction_t setChargesKernel;
void* pinnedBuffer;
HipArray posq;
HipArray posqCorrection;
HipArray velm;
HipArray force;
HipArray energyBuffer;
HipArray energySum;
HipArray energyParamDerivBuffer;
HipArray atomIndexDevice;
HipArray chargeBuffer;
std::vector energyParamDerivNames;
std::map energyParamDerivWorkspace;
std::vector autoclearBuffers;
std::vector autoclearBufferSizes;
HipIntegrationUtilities* integration;
HipExpressionUtilities* expression;
HipBondedUtilities* bonded;
HipNonbondedUtilities* nonbonded;
};
/**
* This class exists only for backward compatibility. Use ComputeContext::WorkTask instead.
*/
class OPENMM_EXPORT_COMMON HipContext::WorkTask : public ComputeContext::WorkTask {
};
/**
* This class exists only for backward compatibility. Use ComputeContext::ReorderListener instead.
*/
class OPENMM_EXPORT_COMMON HipContext::ReorderListener : public ComputeContext::ReorderListener {
};
/**
* This class exists only for backward compatibility. Use ComputeContext::ForcePreComputation instead.
*/
class OPENMM_EXPORT_COMMON HipContext::ForcePreComputation : public ComputeContext::ForcePreComputation {
};
/**
* This class exists only for backward compatibility. Use ComputeContext::ForcePostComputation instead.
*/
class OPENMM_EXPORT_COMMON HipContext::ForcePostComputation : public ComputeContext::ForcePostComputation {
};
} // namespace OpenMM
#endif /*OPENMM_HIPCONTEXT_H_*/