#include "openmm/internal/windowsExport.h"
#include "CudaPlatform.h"
namespace OpenMM {
class CudaArray;
class CudaForceInfo;
class CudaIntegrationUtilities;
class CudaBondedUtilities;
class CudaNonbondedUtilities;
class System;
/**
* This class contains the information associated with a Context by the CUDA Platform. Each CudaContext is
* specific to a particular device, and manages data structures and kernels for that device. When running a simulation
* in parallel on multiple devices, there is a separate CudaContext for each one. The list of all contexts is
* stored in the CudaPlatform::PlatformData.
*
* In addition, a worker thread is created for each CudaContext. This is used for parallel computations, so that
* blocking calls to one device will not block other devices. When only a single device is being used, the worker
* thread is not used and calculations are performed on the main application thread.
*/
class OPENMM_EXPORT CudaContext {
public:
class WorkTask;
class WorkThread;
class ReorderListener;
static const int ThreadBlockSize;
static const int TileSize;
CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
~CudaContext();
// /**
// * This is called to initialize internal data structures after all Forces in the system
// * have been initialized.
// */
// void initialize();
/**
* Add a CudaForce to this context.
*/
void addForce(CudaForceInfo* force);
/**
* Get the CUcontext associated with this object.
*/
CUcontext getContext() {
return context;
}
/**
* Get the CUdevice associated with this object.
*/
CUdevice getDevice() {
return device;
}
/**
* Get the index of the CUdevice associated with this object.
*/
int getDeviceIndex() {
return deviceIndex;
}
/**
* Get the PlatformData object this context is part of.
*/
CudaPlatform::PlatformData& getPlatformData() {
return platformData;
}
/**
* Get the index of this context in the list stored in the PlatformData.
*/
int getContextIndex() const {
return contextIndex;
}
/**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/
CudaArray& getPosq() {
return *posq;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
CudaArray& getVelm() {
return *velm;
}
// /**
// * Get the array which contains the force on each atom.
// */
// CudaArray& getForce() {
// return *force;
// }
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
// CudaArray& getForceBuffers() {
// return *forceBuffers;
// }
// /**
// * Get the array which contains a contribution to each force represented as 64 bit fixed point.
// */
// CudaArray& getLongForceBuffer() {
// return *longForceBuffer;
// }
// /**
// * Get the array which contains the buffer in which energy is computed.
// */
// CudaArray& getEnergyBuffer() {
// return *energyBuffer;
// }
// /**
// * Get the array which contains the index of each atom.
// */
// CudaArray& getAtomIndex() {
// return *atomIndex;
// }
// /**
// * Get the number of cells by which the positions are offset.
// */
// std::vector& getPosCellOffsets() {
// return posCellOffsets;
// }
/**
* Replace all occurrences of a list of substrings.
*
* @param input a string to process
* @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
* @return a new string produced by performing the replacements
*/
std::string replaceStrings(const std::string& input, const std::map& replacements) const;
/**
* Create a CUDA module from source code.
*
* @param source the source code of the module
* @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is
* omitted, a default set of options will be used
*/
CUmodule createModule(const std::string source, const char* optimizationFlags = NULL);
/**
* Create a CUDA module from source code.
*
* @param source the source code of the module
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
* @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is
* omitted, a default set of options will be used
*/
CUmodule createModule(const std::string source, const std::map& defines, const char* optimizationFlags = NULL);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
// void clearAutoclearBuffers();
// /**
// * Given a collection of buffers packed into an array, sum them and store
// * the sum in the first buffer.
// *
// * @param array the array containing the buffers to reduce
// * @param numBuffers the number of buffers packed into the array
// */
// void reduceBuffer(CudaArray& array, int numBuffers);
// /**
// * Sum the buffesr containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
// /**
// * Get the size of the periodic box.
// */
// mm_float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
// mm_float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
// * Get the CudaIntegrationUtilities for this context.
// */
// CudaIntegrationUtilities& getIntegrationUtilities() {
// return *integration;
// }
// /**
// * Get the CudaBondedUtilities for this context.
// */
// CudaBondedUtilities& getBondedUtilities() {
// return *bonded;
// }
// /**
// * Get the CudaNonbondedUtilities for this context.
// */
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
// /**
// * Get the thread used by this context for executing parallel computations.
// */
// WorkThread& getWorkThread() {
// return *thread;
// }
// /**
// * Get whether atoms were reordered during the most recent force/energy computation.
// */
// bool getAtomsWereReordered() const {
// return atomsWereReordered;
// }
// /**
// * Set whether atoms were reordered during the most recent force/energy computation.
// */
// void setAtomsWereReordered(bool wereReordered) {
// atomsWereReordered = wereReordered;
// }
// /**
// * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
// * together in the arrays.
// *
// * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
// */
// void reorderAtoms(bool enforcePeriodic);
// /**
// * Add a listener that should be called whenever atoms get reordered. The CudaContext
// * assumes ownership of the object, and deletes it when the context itself is deleted.
// */
// void addReorderListener(ReorderListener* listener);
// /**
// * Get the list of ReorderListeners.
// */
// std::vector& getReorderListeners() {
// return reorderListeners;
// }
// /**
// * Mark that the current molecule definitions (and hence the atom order) may be invalid.
// * This should be called whenever force field parameters change. It will cause the definitions
// * and order to be revalidated the next to reorderAtoms() is called.
// */
// void invalidateMolecules();
// /**
// * Get whether the current molecule definitions are valid.
// */
// bool getMoleculesAreInvalid() {
// return moleculesInvalid;
// }
private:
struct Molecule;
struct MoleculeGroup;
class VirtualSiteInfo;
// void findMoleculeGroups();
// static void tagAtomsInMolecule(int atom, int molecule, std::vector& atomMolecule, std::vector >& atomBonds);
// /**
// * Ensure that all molecules marked as "identical" really are identical. This should be
// * called whenever force field parameters change. If necessary, it will rebuild the list
// * of molecules and resort the atoms.
// */
// void validateMolecules();
static bool hasInitializedCuda;
const System& system;
double time;
CudaPlatform::PlatformData& platformData;
int deviceIndex;
int contextIndex;
int stepCount;
int computeForceCount;
int numAtoms;
int paddedNumAtoms;
int numAtomBlocks;
int numThreadBlocks;
// int numForceBuffers;
// int simdWidth;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture;
float4 periodicBoxSize;
float4 invPeriodicBoxSize;
std::string defaultOptimizationOptions;
std::map compilationDefines;
CUcontext context;
CUdevice device;
CUfunction clearBufferKernel;
CUfunction clearTwoBuffersKernel;
CUfunction clearThreeBuffersKernel;
CUfunction clearFourBuffersKernel;
CUfunction clearFiveBuffersKernel;
CUfunction clearSixBuffersKernel;
CUfunction reduceFloat4Kernel;
CUfunction reduceForcesKernel;
std::vector forces;
std::vector molecules;
std::vector moleculeGroups;
std::vector posCellOffsets;
CudaArray* posq;
CudaArray* velm;
// CudaArray* force;
// CudaArray* forceBuffers;
// CudaArray* longForceBuffer;
// CudaArray* energyBuffer;
// CudaArray* atomIndex;
// std::vector autoclearBuffers;
// std::vector autoclearBufferSizes;
std::vector reorderListeners;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
// CudaNonbondedUtilities* nonbonded;
WorkThread* thread;
};
struct CudaContext::Molecule {
std::vector atoms;
std::vector constraints;
std::vector > groups;
};
struct CudaContext::MoleculeGroup {
std::vector atoms;
std::vector instances;
std::vector offsets;
};
/**
* This abstract class defines a task to be executed on the worker thread.
*/
class CudaContext::WorkTask {
public:
virtual void execute() = 0;
virtual ~WorkTask() {
}
};
class CudaContext::WorkThread {
public:
struct ThreadData;
WorkThread();
~WorkThread();
/**
* Request that a task be executed on the worker thread. The argument should have been allocated on the
* heap with the "new" operator. After its execute() method finishes, the object will be deleted automatically.
*/
void addTask(CudaContext::WorkTask* task);
/**
* Get whether the worker thread is idle, waiting for a task to be added.
*/
bool isWaiting();
/**
* Get whether the worker thread has exited.
*/
bool isFinished();
/**
* Block until all tasks have finished executing and the worker thread is idle.
*/
void flush();
private:
std::queue tasks;
bool waiting, finished;
pthread_mutex_t queueLock;
pthread_cond_t waitForTaskCondition, queueEmptyCondition;
pthread_t thread;
};
/**
* This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a reorderListener
* and register it by calling addReorderListener().
*/
class CudaContext::ReorderListener {
public:
virtual void execute() = 0;
virtual ~ReorderListener() {
}
};
} // namespace OpenMM
#endif /*OPENMM_CUDACONTEXT_H_*/