#ifndef OPENMM_CUDACONTEXT_H_ #define OPENMM_CUDACONTEXT_H_ /* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #ifdef _MSC_VER // Prevent Windows from defining macros that interfere with other code. #define NOMINMAX #endif #include #include #include #include "openmm/internal/windowsExport.h" #include "CudaPlatform.h" namespace OpenMM { class CudaArray; class CudaForceInfo; class CudaIntegrationUtilities; class CudaBondedUtilities; class CudaNonbondedUtilities; class System; /** * This class contains the information associated with a Context by the CUDA Platform. Each CudaContext is * specific to a particular device, and manages data structures and kernels for that device. When running a simulation * in parallel on multiple devices, there is a separate CudaContext for each one. The list of all contexts is * stored in the CudaPlatform::PlatformData. *

* In addition, a worker thread is created for each CudaContext. This is used for parallel computations, so that * blocking calls to one device will not block other devices. When only a single device is being used, the worker * thread is not used and calculations are performed on the main application thread. */ class OPENMM_EXPORT CudaContext { public: class WorkTask; class WorkThread; class ReorderListener; static const int ThreadBlockSize; static const int TileSize; CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision, const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData); ~CudaContext(); // /** // * This is called to initialize internal data structures after all Forces in the system // * have been initialized. // */ // void initialize(); /** * Add a CudaForce to this context. */ void addForce(CudaForceInfo* force); /** * Get the CUcontext associated with this object. */ CUcontext getContext() { return context; } /** * Get the CUdevice associated with this object. */ CUdevice getDevice() { return device; } /** * Get the index of the CUdevice associated with this object. */ int getDeviceIndex() { return deviceIndex; } /** * Get the PlatformData object this context is part of. */ CudaPlatform::PlatformData& getPlatformData() { return platformData; } /** * Get the index of this context in the list stored in the PlatformData. */ int getContextIndex() const { return contextIndex; } /** * Get the array which contains the position (the xyz components) and charge (the w component) of each atom. */ CudaArray& getPosq() { return *posq; } /** * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom. */ CudaArray& getVelm() { return *velm; } // /** // * Get the array which contains the force on each atom. // */ // CudaArray& getForce() { // return *force; // } // /** // * Get the array which contains the buffers in which forces are computed. // */ // CudaArray& getForceBuffers() { // return *forceBuffers; // } // /** // * Get the array which contains a contribution to each force represented as 64 bit fixed point. // */ // CudaArray& getLongForceBuffer() { // return *longForceBuffer; // } // /** // * Get the array which contains the buffer in which energy is computed. // */ // CudaArray& getEnergyBuffer() { // return *energyBuffer; // } // /** // * Get the array which contains the index of each atom. // */ // CudaArray& getAtomIndex() { // return *atomIndex; // } // /** // * Get the number of cells by which the positions are offset. // */ // std::vector& getPosCellOffsets() { // return posCellOffsets; // } /** * Replace all occurrences of a list of substrings. * * @param input a string to process * @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string * @return a new string produced by performing the replacements */ std::string replaceStrings(const std::string& input, const std::map& replacements) const; /** * Create a CUDA module from source code. * * @param source the source code of the module * @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is * omitted, a default set of options will be used */ CUmodule createModule(const std::string source, const char* optimizationFlags = NULL); /** * Create a CUDA module from source code. * * @param source the source code of the module * @param defines a set of preprocessor definitions (name, value) to define when compiling the program * @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is * omitted, a default set of options will be used */ CUmodule createModule(const std::string source, const std::map& defines, const char* optimizationFlags = NULL); // /** // * Execute a kernel. // * // * @param kernel the kernel to execute // * @param workUnits the maximum number of work units that should be used // * @param blockSize the size of each thread block to use // */ // void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1); // /** // * Set all elements of an array to 0. // */ // void clearBuffer(CudaArray& array); // /** // * Set all elements of an array to 0. // */ // void clearBuffer(CudaArray& array); // /** // * Set all elements of an array to 0. // * // * @param memory the Memory to clear // * @param size the number of float elements in the buffer // */ // void clearBuffer(cl::Memory& memory, int size); // /** // * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation. // * // * @param memory the Memory to clear // * @param size the number of float elements in the buffer // */ // void addAutoclearBuffer(cl::Memory& memory, int size); // /** // * Clear all buffers that have been registered with addAutoclearBuffer(). // */ // void clearAutoclearBuffers(); // /** // * Given a collection of buffers packed into an array, sum them and store // * the sum in the first buffer. // * // * @param array the array containing the buffers to reduce // * @param numBuffers the number of buffers packed into the array // */ // void reduceBuffer(CudaArray& array, int numBuffers); // /** // * Sum the buffesr containing forces. // */ // void reduceForces(); // /** // * Get the current simulation time. // */ // double getTime() { // return time; // } // /** // * Set the current simulation time. // */ // void setTime(double t) { // time = t; // } // /** // * Get the number of integration steps that have been taken. // */ // int getStepCount() { // return stepCount; // } // /** // * Set the number of integration steps that have been taken. // */ // void setStepCount(int steps) { // stepCount = steps; // } // /** // * Get the number of times forces or energy has been computed. // */ // int getComputeForceCount() { // return computeForceCount; // } // /** // * Set the number of times forces or energy has been computed. // */ // void setComputeForceCount(int count) { // computeForceCount = count; // } // /** // * Get the number of atoms. // */ // int getNumAtoms() const { // return numAtoms; // } // /** // * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of // * most arrays with one element per atom. // */ // int getPaddedNumAtoms() const { // return paddedNumAtoms; // } // /** // * Get the number of blocks of TileSize atoms. // */ // int getNumAtomBlocks() const { // return numAtomBlocks; // } // /** // * Get the standard number of thread blocks to use when executing kernels. // */ // int getNumThreadBlocks() const { // return numThreadBlocks; // } // /** // * Get the number of force buffers. // */ // int getNumForceBuffers() const { // return numForceBuffers; // } // /** // * Get the SIMD width of the device being used. // */ // int getSIMDWidth() const { // return simdWidth; // } // /** // * Get whether the device being used supports 64 bit atomic operations on global memory. // */ // bool getSupports64BitGlobalAtomics() { // return supports64BitGlobalAtomics; // } // /** // * Get whether the device being used supports double precision math. // */ // bool getSupportsDoublePrecision() { // return supportsDoublePrecision; // } // /** // * Get the size of the periodic box. // */ // mm_float4 getPeriodicBoxSize() const { // return periodicBoxSize; // } // /** // * Set the size of the periodic box. // */ // void setPeriodicBoxSize(double xsize, double ysize, double zsize) { // periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0); // invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0); // } // /** // * Get the inverse of the size of the periodic box. // */ // mm_float4 getInvPeriodicBoxSize() const { // return invPeriodicBoxSize; // } // /** // * Get the CudaIntegrationUtilities for this context. // */ // CudaIntegrationUtilities& getIntegrationUtilities() { // return *integration; // } // /** // * Get the CudaBondedUtilities for this context. // */ // CudaBondedUtilities& getBondedUtilities() { // return *bonded; // } // /** // * Get the CudaNonbondedUtilities for this context. // */ // CudaNonbondedUtilities& getNonbondedUtilities() { // return *nonbonded; // } // /** // * Get the thread used by this context for executing parallel computations. // */ // WorkThread& getWorkThread() { // return *thread; // } // /** // * Get whether atoms were reordered during the most recent force/energy computation. // */ // bool getAtomsWereReordered() const { // return atomsWereReordered; // } // /** // * Set whether atoms were reordered during the most recent force/energy computation. // */ // void setAtomsWereReordered(bool wereReordered) { // atomsWereReordered = wereReordered; // } // /** // * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close // * together in the arrays. // * // * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions // */ // void reorderAtoms(bool enforcePeriodic); // /** // * Add a listener that should be called whenever atoms get reordered. The CudaContext // * assumes ownership of the object, and deletes it when the context itself is deleted. // */ // void addReorderListener(ReorderListener* listener); // /** // * Get the list of ReorderListeners. // */ // std::vector& getReorderListeners() { // return reorderListeners; // } // /** // * Mark that the current molecule definitions (and hence the atom order) may be invalid. // * This should be called whenever force field parameters change. It will cause the definitions // * and order to be revalidated the next to reorderAtoms() is called. // */ // void invalidateMolecules(); // /** // * Get whether the current molecule definitions are valid. // */ // bool getMoleculesAreInvalid() { // return moleculesInvalid; // } private: struct Molecule; struct MoleculeGroup; class VirtualSiteInfo; // void findMoleculeGroups(); // static void tagAtomsInMolecule(int atom, int molecule, std::vector& atomMolecule, std::vector >& atomBonds); // /** // * Ensure that all molecules marked as "identical" really are identical. This should be // * called whenever force field parameters change. If necessary, it will rebuild the list // * of molecules and resort the atoms. // */ // void validateMolecules(); static bool hasInitializedCuda; const System& system; double time; CudaPlatform::PlatformData& platformData; int deviceIndex; int contextIndex; int stepCount; int computeForceCount; int numAtoms; int paddedNumAtoms; int numAtomBlocks; int numThreadBlocks; // int numForceBuffers; // int simdWidth; bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid; std::string compiler, tempDir, gpuArchitecture; float4 periodicBoxSize; float4 invPeriodicBoxSize; std::string defaultOptimizationOptions; std::map compilationDefines; CUcontext context; CUdevice device; CUfunction clearBufferKernel; CUfunction clearTwoBuffersKernel; CUfunction clearThreeBuffersKernel; CUfunction clearFourBuffersKernel; CUfunction clearFiveBuffersKernel; CUfunction clearSixBuffersKernel; CUfunction reduceFloat4Kernel; CUfunction reduceForcesKernel; std::vector forces; std::vector molecules; std::vector moleculeGroups; std::vector posCellOffsets; CudaArray* posq; CudaArray* velm; // CudaArray* force; // CudaArray* forceBuffers; // CudaArray* longForceBuffer; // CudaArray* energyBuffer; // CudaArray* atomIndex; // std::vector autoclearBuffers; // std::vector autoclearBufferSizes; std::vector reorderListeners; // CudaIntegrationUtilities* integration; // CudaBondedUtilities* bonded; // CudaNonbondedUtilities* nonbonded; WorkThread* thread; }; struct CudaContext::Molecule { std::vector atoms; std::vector constraints; std::vector > groups; }; struct CudaContext::MoleculeGroup { std::vector atoms; std::vector instances; std::vector offsets; }; /** * This abstract class defines a task to be executed on the worker thread. */ class CudaContext::WorkTask { public: virtual void execute() = 0; virtual ~WorkTask() { } }; class CudaContext::WorkThread { public: struct ThreadData; WorkThread(); ~WorkThread(); /** * Request that a task be executed on the worker thread. The argument should have been allocated on the * heap with the "new" operator. After its execute() method finishes, the object will be deleted automatically. */ void addTask(CudaContext::WorkTask* task); /** * Get whether the worker thread is idle, waiting for a task to be added. */ bool isWaiting(); /** * Get whether the worker thread has exited. */ bool isFinished(); /** * Block until all tasks have finished executing and the worker thread is idle. */ void flush(); private: std::queue tasks; bool waiting, finished; pthread_mutex_t queueLock; pthread_cond_t waitForTaskCondition, queueEmptyCondition; pthread_t thread; }; /** * This abstract class defines a function to be executed whenever atoms get reordered. * Objects that need to know when reordering happens should create a reorderListener * and register it by calling addReorderListener(). */ class CudaContext::ReorderListener { public: virtual void execute() = 0; virtual ~ReorderListener() { } }; } // namespace OpenMM #endif /*OPENMM_CUDACONTEXT_H_*/