Unverified Commit 2443dcee authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Common implementation of NonbondedForce (#4922)

* Use common API for kernels

* More code uses common interface

* Bug fixes

* Unified interface for sorting

* Simplified interface for FFT

* Use common event API for synchronization

* Minor changes to make code more consistent between platforms

* Common implementation of NonbondedForce

* Bug fixes

* Flag to enable list of single pairs

* CUDA and OpenCL use common implementation of NonbondedForce

* Fixed compilation error

* HIP uses common implementation of NonbondedForce
parent dfb8d755
#ifndef OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_
#define OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/kernels.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/common/ComputeContext.h"
#include "openmm/common/ComputeEvent.h"
#include "openmm/common/ComputeQueue.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/FFT3D.h"
#include <map>
#include <string>
#include <utility>
#include <vector>
namespace OpenMM {
/**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 *
 * It is written in terms of the platform independent compute classes (ComputeContext,
 * ComputeArray, ComputeKernel, ComputeSort, FFT3D, ...). A subclass supplies the
 * platform specific settings by calling commonInitialize() from its own initialize() method.
 */
class CommonCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
    /**
     * Create the kernel. Only trivial state is set up here; see commonInitialize().
     *
     * @param name      the kernel name, forwarded to the CalcNonbondedForceKernel constructor
     * @param platform  the Platform, forwarded to the CalcNonbondedForceKernel constructor
     * @param cc        the ComputeContext this kernel works with. A reference to it is stored.
     * @param system    not used by this constructor; kept so the signature stays unchanged
     */
    CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) :
            CalcNonbondedForceKernel(name, platform),
            // Listed in member declaration order, which is the order in which the initializers
            // actually run. Every raw pointer member starts out null rather than indeterminate.
            cc(cc), info(nullptr), hasInitializedKernel(false), pmeio(nullptr), syncQueue(nullptr) {
    }
    ~CommonCalcNonbondedForceKernel();
    /**
     * Initialize the kernel. Subclasses should call this from their initialize() method.
     *
     * @param system                        the System this kernel will be applied to
     * @param force                         the NonbondedForce this kernel will be used for
     * @param usePmeQueue                   whether to perform PME on a separate queue
     * @param deviceIsCpu                   whether the device this calculation is running on is a CPU
     * @param useFixedPointChargeSpreading  whether PME charge spreading should be done in fixed point or floating point
     * @param useCpuPme                     whether to perform the PME reciprocal space calculation on the CPU
     */
    void commonInitialize(const System& system, const NonbondedForce& force, bool usePmeQueue, bool deviceIsCpu, bool useFixedPointChargeSpreading, bool useCpuPme);
    /**
     * Execute the kernel to calculate the forces and/or energy.
     *
     * @param context            the context in which to execute this kernel
     * @param includeForces      true if forces should be calculated
     * @param includeEnergy      true if the energy should be calculated
     * @param includeDirect      true if direct space interactions should be included
     * @param includeReciprocal  true if reciprocal space interactions should be included
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
    /**
     * Copy changed parameters over to a context.
     *
     * @param context         the context to copy parameters to
     * @param force           the NonbondedForce to copy the parameters from
     * @param firstParticle   the index of the first particle whose parameters might have changed
     * @param lastParticle    the index of the last particle whose parameters might have changed
     * @param firstException  the index of the first exception whose parameters might have changed
     * @param lastException   the index of the last exception whose parameters might have changed
     */
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException);
    /**
     * Get the parameters being used for PME.
     *
     * @param alpha  the separation parameter
     * @param nx     the number of grid points along the X axis
     * @param ny     the number of grid points along the Y axis
     * @param nz     the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
    /**
     * Get the parameters being used for the dispersion term in LJPME.
     *
     * @param alpha  the separation parameter
     * @param nx     the number of grid points along the X axis
     * @param ny     the number of grid points along the Y axis
     * @param nz     the number of grid points along the Z axis
     */
    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
    /**
     * Sort description for 8 byte "int2" values that are ordered by their 4 byte
     * integer "y" component.
     */
    class SortTrait : public ComputeSortImpl::SortTrait {
        int getDataSize() const override {return 8;}
        int getKeySize() const override {return 4;}
        const char* getDataType() const override {return "int2";}
        const char* getKeyType() const override {return "int";}
        const char* getMinKey() const override {return "(-2147483647-1)";}
        const char* getMaxKey() const override {return "2147483647";}
        const char* getMaxValue() const override {return "make_int2(2147483647, 2147483647)";}
        const char* getSortKey() const override {return "value.y";}
    };
    // Helper classes that are only declared here; their definitions live elsewhere.
    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncQueuePreComputation;
    class SyncQueuePostComputation;
    // NOTE: members are initialized in exactly this order, so keep the constructor's
    // initializer list in sync when adding or moving members.
    ComputeContext& cc;
    ForceInfo* info;
    bool hasInitializedKernel;
    // Device arrays holding particle, exception and exclusion parameters.
    ComputeArray charges;
    ComputeArray sigmaEpsilon;
    ComputeArray exceptionParams;
    ComputeArray exclusionAtoms;
    ComputeArray exclusionParams;
    ComputeArray baseParticleParams;
    ComputeArray baseExceptionParams;
    ComputeArray particleParamOffsets;
    ComputeArray exceptionParamOffsets;
    ComputeArray particleOffsetIndices;
    ComputeArray exceptionOffsetIndices;
    ComputeArray globalParams;
    // Device arrays used by the Ewald and PME code paths.
    ComputeArray cosSinSums;
    ComputeArray pmeGrid1;
    ComputeArray pmeGrid2;
    ComputeArray pmeBsplineModuliX;
    ComputeArray pmeBsplineModuliY;
    ComputeArray pmeBsplineModuliZ;
    ComputeArray pmeDispersionBsplineModuliX;
    ComputeArray pmeDispersionBsplineModuliY;
    ComputeArray pmeDispersionBsplineModuliZ;
    ComputeArray pmeAtomGridIndex;
    ComputeArray pmeEnergyBuffer;
    ComputeArray chargeBuffer;
    // Platform independent handles (shared_ptr based, so they start out empty).
    ComputeSort sort;
    ComputeQueue pmeQueue;
    ComputeEvent pmeSyncEvent, paramsSyncEvent;
    FFT3D fft, dispersionFft;
    Kernel cpuPme;
    PmeIO* pmeio;
    SyncQueuePostComputation* syncQueue;
    // Compiled kernels, one pair (electrostatics / dispersion) per PME stage.
    ComputeKernel computeParamsKernel, computeExclusionParamsKernel, computePlasmaCorrectionKernel;
    ComputeKernel ewaldSumsKernel, ewaldForcesKernel;
    ComputeKernel pmeGridIndexKernel, pmeDispersionGridIndexKernel;
    ComputeKernel pmeSpreadChargeKernel, pmeDispersionSpreadChargeKernel;
    ComputeKernel pmeFinishSpreadChargeKernel, pmeDispersionFinishSpreadChargeKernel;
    ComputeKernel pmeConvolutionKernel, pmeDispersionConvolutionKernel;
    ComputeKernel pmeEvalEnergyKernel, pmeDispersionEvalEnergyKernel;
    ComputeKernel pmeInterpolateForceKernel, pmeDispersionInterpolateForceKernel;
    // Host side bookkeeping.
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
    std::vector<std::string> paramNames;
    std::vector<double> paramValues;
    std::map<int, int> exceptionIndex;
    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha, totalCharge;
    int gridSizeX, gridSizeY, gridSizeZ;
    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
    // The first four flags mirror the arguments of commonInitialize().
    bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme;
    bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
};
} // namespace OpenMM
#endif /*OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_*/
......@@ -37,11 +37,13 @@
#include "openmm/common/ComputeForceInfo.h"
#include "openmm/common/ComputeProgram.h"
#include "openmm/common/ComputeQueue.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/ComputeVectorTypes.h"
#include "openmm/common/FFT3D.h"
#include "openmm/common/IntegrationUtilities.h"
#include "openmm/common/NonbondedUtilities.h"
#include "openmm/Vec3.h"
#include "openmm/internal/ContextImpl.h"
#include <condition_variable>
#include <map>
#include <mutex>
......@@ -139,6 +141,10 @@ public:
* one ComputeContext is created for each device.
*/
virtual std::vector<ComputeContext*> getAllContexts() = 0;
/**
* Get the ContextImpl this ComputeContext is associated with.
*/
virtual ContextImpl& getContextImpl() = 0;
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
......@@ -169,6 +175,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
virtual ComputeEvent createEvent() = 0;
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
virtual ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true) = 0;
/**
* Compile source code to create a ComputeProgram.
*
......@@ -501,7 +520,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
virtual FFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false) = 0;
virtual FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false) = 0;
/**
* Get the smallest legal size for a dimension of the grid.
*/
......@@ -511,6 +530,15 @@ public:
* It ensures all contexts are fully initialized.
*/
virtual void initializeContexts() = 0;
/**
* Set the particle charges. These are packed into the fourth element of the posq array.
*/
virtual void setCharges(const std::vector<double>& charges) = 0;
/**
* Request to use the fourth element of the posq array for storing charges. Since only one force can
* do that, this returns true the first time it is called, and false on all subsequent calls.
*/
virtual bool requestPosqCharges() = 0;
/**
* Get the thread used by this context for executing parallel computations.
*/
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -27,17 +27,18 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "ComputeQueue.h"
#include <memory>
namespace OpenMM {
/**
* This abstract class represents an event for synchronization between the host and
* device. It is created by calling createEvent() on a ComputeContext, which returns
* an instance of a platform-specific subclass. To use it, call enqueue() immediately
* after starting an asynchronous operation, such as a kernel invocation or non-blocking
* data transfer. Then at a later point call wait(). This will cause the host to block
* until all operations started before the call to enqueue() have completed.
* This abstract class represents an event for synchronization between the host and device,
* or between queues on the same device. It is created by calling createEvent() on a ComputeContext,
* which returns an instance of a platform-specific subclass. To use it, call enqueue() immediately
* after starting an asynchronous operation, such as a kernel invocation or non-blocking data
* transfer. Then at a later point call wait() or queueWait(). This will cause the host or a
* specified queue to block until all operations started before the call to enqueue() have completed.
*
* Instead of referring to this class directly, it is best to use a ComputeEvent, which is
* a typedef for a shared_ptr to a ComputeEventImpl. This allows you to treat it as having
......@@ -56,6 +57,11 @@ public:
* Block until all operations started before the call to enqueue() have completed.
*/
virtual void wait() = 0;
/**
* Enqueue a barrier that causes a specified ComputeQueue to block until all
* operations started before the call to enqueue() have completed.
*/
virtual void queueWait(ComputeQueue queue) = 0;
};
typedef std::shared_ptr<ComputeEventImpl> ComputeEvent;
......
#ifndef OPENMM_COMPUTESORT_H_
#define OPENMM_COMPUTESORT_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/common/ArrayInterface.h"
#include "openmm/common/windowsExportCommon.h"
#include <memory>
namespace OpenMM {
/**
 * Abstract interface to an algorithm for sorting arrays. Instances are obtained by
 * calling createSort() on a ComputeContext, which returns an instance of a
 * platform-specific subclass.
 *
 * Client code should normally hold a ComputeSort rather than naming this class
 * directly. ComputeSort is a shared_ptr to a ComputeSortImpl, so it can be copied
 * around like a value and no manual memory management is required.
 */
class OPENMM_EXPORT_COMMON ComputeSortImpl {
public:
    /**
     * Describes the values being sorted and the key they are sorted by.
     * It is defined after this class.
     */
    class SortTrait;
    virtual ~ComputeSortImpl() = default;
    /**
     * Sort an array.
     *
     * @param data  the array to sort
     */
    virtual void sort(ArrayInterface& data) = 0;
};
using ComputeSort = std::shared_ptr<ComputeSortImpl>;
/**
 * A subclass of SortTrait tells a ComputeSortImpl what kind of value it sorts and
 * which key those values are ordered by. The type names, limits and key expression
 * are all reported as strings.
 */
class ComputeSortImpl::SortTrait {
public:
    virtual ~SortTrait() = default;
    /** Get the size of each data value in bytes. */
    virtual int getDataSize() const = 0;
    /** Get the size of each key value in bytes. */
    virtual int getKeySize() const = 0;
    /** Get the data type of the values to sort. */
    virtual const char* getDataType() const = 0;
    /** Get the data type of the sorting key. */
    virtual const char* getKeyType() const = 0;
    /** Get the minimum value a key can take. */
    virtual const char* getMinKey() const = 0;
    /** Get the maximum value a key can take. */
    virtual const char* getMaxKey() const = 0;
    /** Get a value whose key is guaranteed to equal getMaxKey(). */
    virtual const char* getMaxValue() const = 0;
    /** Get the source code to select the key from the data value. */
    virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif /*OPENMM_COMPUTESORT_H_*/
......@@ -28,6 +28,7 @@
* -------------------------------------------------------------------------- */
#include "openmm/common/ArrayInterface.h"
#include <memory>
namespace OpenMM {
......@@ -44,11 +45,15 @@ namespace OpenMM {
* Note that this class performs an unnormalized transform. That means that if you perform
* a forward transform followed immediately by an inverse transform, the effect is to
* multiply every value of the original data set by the total number of data points.
*
* Instead of referring to this class directly, it is best to use a FFT3D, which is
* a typedef for a shared_ptr to a FFT3DImpl. This allows you to treat it as having
* value semantics, and frees you from having to manage memory.
*/
class OPENMM_EXPORT_COMMON FFT3D {
class OPENMM_EXPORT_COMMON FFT3DImpl {
public:
virtual ~FFT3D() {
virtual ~FFT3DImpl() {
}
/**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output
......@@ -66,6 +71,8 @@ public:
virtual void execFFT(ArrayInterface& in, ArrayInterface& out, bool forward=true) = 0;
};
typedef std::shared_ptr<FFT3DImpl> FFT3D;
} // namespace OpenMM
#endif // __OPENMM_FFT3D_H__
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2009-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -71,8 +71,11 @@ public:
* @param forceGroup the force group in which the interaction should be calculated
* @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList=true) = 0;
virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const std::vector<std::vector<int> >& exclusionList, const std::string& kernel,
int forceGroup, bool useNeighborList=true, bool supportsPairList=false) = 0;
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
......
This diff is collapsed.
......@@ -42,7 +42,6 @@
#include "CudaArray.h"
#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaFFT3D.h"
#include "CudaIntegrationUtilities.h"
#include "CudaNonbondedUtilities.h"
#include "CudaPlatform.h"
......@@ -154,6 +153,12 @@ public:
* one ComputeContext is created for each device.
*/
std::vector<ComputeContext*> getAllContexts();
/**
* Get the ContextImpl this ComputeContext is associated with.
*/
ContextImpl& getContextImpl() {
return *platformData.context;
}
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
......@@ -176,6 +181,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/**
* Compile source code to create a ComputeProgram.
*
......@@ -515,7 +533,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
CudaFFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/**
* This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized.
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -48,6 +48,11 @@ public:
* Block until all operations started before the call to enqueue() have completed.
*/
void wait();
/**
* Enqueue a barrier that causes a specified ComputeQueue to block until all
* operations started before the call to enqueue() have completed.
*/
void queueWait(ComputeQueue queue);
private:
CudaContext& context;
CUevent event;
......
......@@ -50,7 +50,7 @@ class CudaContext;
* multiply every value of the original data set by the total number of data points.
*/
class OPENMM_EXPORT_COMMON CudaFFT3D : public FFT3D {
class OPENMM_EXPORT_COMMON CudaFFT3D : public FFT3DImpl {
public:
/**
* Create a CudaFFT3D object for performing transforms of a particular size.
......
......@@ -30,11 +30,12 @@
#include "CudaPlatform.h"
#include "CudaArray.h"
#include "CudaContext.h"
#include "CudaFFT3D.h"
#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
#include "openmm/common/CommonKernels.h"
#include "openmm/common/CommonCalcNonbondedForce.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/FFT3D.h"
namespace OpenMM {
......@@ -85,12 +86,11 @@ private:
/**
* This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
*/
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
class CudaCalcNonbondedForceKernel : public CommonCalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) {
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
CommonCalcNonbondedForceKernel(name, platform, cu, system), cu(cu) {
}
~CudaCalcNonbondedForceKernel();
/**
* Initialize the kernel.
*
......@@ -98,123 +98,8 @@ public:
* @param force the NonbondedForce this kernel will be used for
*/
void initialize(const System& system, const NonbondedForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @param includeDirect true if direct space interactions should be included
* @param includeReciprocal true if reciprocal space interactions should be included
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the NonbondedForce to copy the parameters from
* @param firstParticle the index of the first particle whose parameters might have changed
* @param lastParticle the index of the last particle whose parameters might have changed
* @param firstException the index of the first exception whose parameters might have changed
* @param lastException the index of the last exception whose parameters might have changed
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException);
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 8;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "int2";}
const char* getKeyType() const {return "int";}
const char* getMinKey() const {return "(-2147483647-1)";}
const char* getMaxKey() const {return "2147483647";}
const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
const char* getSortKey() const {return "value.y";}
};
class ForceInfo;
class PmeIO;
class PmePreComputation;
class PmePostComputation;
class SyncStreamPreComputation;
class SyncStreamPostComputation;
CudaContext& cu;
ForceInfo* info;
bool hasInitializedFFT;
CudaArray charges;
CudaArray sigmaEpsilon;
CudaArray exceptionParams;
CudaArray exclusionAtoms;
CudaArray exclusionParams;
CudaArray baseParticleParams;
CudaArray baseExceptionParams;
CudaArray particleParamOffsets;
CudaArray exceptionParamOffsets;
CudaArray particleOffsetIndices;
CudaArray exceptionOffsetIndices;
CudaArray globalParams;
CudaArray cosSinSums;
CudaArray pmeGrid1;
CudaArray pmeGrid2;
CudaArray pmeBsplineModuliX;
CudaArray pmeBsplineModuliY;
CudaArray pmeBsplineModuliZ;
CudaArray pmeDispersionBsplineModuliX;
CudaArray pmeDispersionBsplineModuliY;
CudaArray pmeDispersionBsplineModuliZ;
CudaArray pmeAtomGridIndex;
CudaArray pmeEnergyBuffer;
CudaArray chargeBuffer;
CudaSort* sort;
Kernel cpuPme;
PmeIO* pmeio;
ComputeQueue pmeQueue;
CUevent pmeSyncEvent, paramsSyncEvent;
CudaFFT3D* fft;
CudaFFT3D* dispersionFft;
CUfunction computeParamsKernel, computeExclusionParamsKernel, computePlasmaCorrectionKernel;
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeDispersionGridIndexKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeDispersionSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeDispersionFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel;
CUfunction pmeEvalDispersionEnergyKernel;
CUfunction pmeConvolutionKernel;
CUfunction pmeDispersionConvolutionKernel;
CUfunction pmeInterpolateForceKernel;
CUfunction pmeInterpolateDispersionForceKernel;
std::vector<std::pair<int, int> > exceptionAtoms;
std::vector<std::string> paramNames;
std::vector<double> paramValues;
std::map<int, int> exceptionIndex;
double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha, totalCharge;
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
/**
......
......@@ -30,6 +30,7 @@
#include "openmm/System.h"
#include "CudaArray.h"
#include "CudaExpressionUtilities.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/NonbondedUtilities.h"
#include <cuda.h>
#include <sstream>
......@@ -39,7 +40,6 @@
namespace OpenMM {
class CudaContext;
class CudaSort;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
......@@ -71,20 +71,6 @@ public:
class ParameterInfo;
CudaNonbondedUtilities(CudaContext& context);
~CudaNonbondedUtilities();
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
* @param usesCutoff specifies whether a cutoff should be applied to this interaction
* @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction
* @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction.
* @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList=true);
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
......@@ -99,7 +85,9 @@ public:
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList);
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const std::vector<std::vector<int> >& exclusionList, const std::string& kernel,
int forceGroup, bool useNeighborList=true, bool supportsPairList=false);
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
......@@ -343,7 +331,7 @@ private:
CudaArray largeBlockBoundingBox;
CudaArray oldPositions;
CudaArray rebuildNeighborList;
CudaSort* blockSorter;
ComputeSort blockSorter;
CUevent downloadCountEvent;
unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Portions copyright (c) 2010-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -28,6 +28,7 @@
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/windowsExportCommon.h"
#include "CudaContext.h"
......@@ -41,7 +42,7 @@ namespace OpenMM {
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class FloatTrait : public CudaSort::SortTrait {
* class FloatTrait : public ComputeSortImpl::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
......@@ -66,9 +67,8 @@ namespace OpenMM {
* elements).
*/
class OPENMM_EXPORT_COMMON CudaSort {
class OPENMM_EXPORT_COMMON CudaSort : public ComputeSortImpl {
public:
class SortTrait;
/**
* Create a CudaSort object for sorting data of a particular type.
*
......@@ -82,15 +82,15 @@ public:
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
CudaSort(CudaContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
~CudaSort();
/**
* Sort an array.
*/
void sort(CudaArray& data);
void sort(ArrayInterface& data);
private:
CudaContext& context;
SortTrait* trait;
ComputeSortImpl::SortTrait* trait;
CudaArray dataRange;
CudaArray bucketOfElement;
CudaArray offsetInBucket;
......@@ -101,48 +101,6 @@ private:
bool isShortList, uniform;
};
/**
* A subclass of SortTrait defines the type of values to sort and the key used for sorting them.
* Every method declared here is pure virtual, so a concrete trait must implement all of them.
*/
class CudaSort::SortTrait {
public:
virtual ~SortTrait() {
}
/**
* Get the size of each data value in bytes.
*/
virtual int getDataSize() const = 0;
/**
* Get the size of each key value in bytes.
*/
virtual int getKeySize() const = 0;
/**
* Get the data type of the values to sort.
*/
virtual const char* getDataType() const = 0;
/**
* Get the data type of the sorting key.
*/
virtual const char* getKeyType() const = 0;
/**
* Get the minimum value a key can take.
*/
virtual const char* getMinKey() const = 0;
/**
* Get the maximum value a key can take.
*/
virtual const char* getMaxKey() const = 0;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual const char* getMaxValue() const = 0;
/**
* Get the CUDA code to select the key from the data value.
*/
virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
......@@ -32,11 +32,13 @@
#include "CudaArray.h"
#include "CudaBondedUtilities.h"
#include "CudaEvent.h"
#include "CudaFFT3D.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernels.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "CudaProgram.h"
#include "CudaSort.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/common/ContextSelector.h"
#include "SHA1.h"
......@@ -439,8 +441,8 @@ void CudaContext::initializeContexts() {
getPlatformData().initializeContexts(system);
}
CudaFFT3D* CudaContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex) {
return new CudaFFT3D(*this, xsize, ysize, zsize, realToComplex);
FFT3D CudaContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex) {
return FFT3D(new CudaFFT3D(*this, xsize, ysize, zsize, realToComplex));
}
void CudaContext::setAsCurrent() {
......@@ -667,6 +669,10 @@ ComputeEvent CudaContext::createEvent() {
return shared_ptr<ComputeEventImpl>(new CudaEvent(*this));
}
// Construct a CudaSort bound to this context and return it as a ComputeSort.
// The trait pointer is forwarded to the CudaSort constructor unchanged, along
// with the array length and the uniform-distribution hint.
ComputeSort CudaContext::createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) {
    shared_ptr<ComputeSortImpl> sorter(new CudaSort(*this, trait, length, uniform));
    return sorter;
}
ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines);
return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module));
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -42,9 +42,13 @@ CudaEvent::~CudaEvent() {
}
void CudaEvent::enqueue() {
cuEventRecord(event, 0);
cuEventRecord(event, context.getCurrentStream());
}
// Synchronize on this event by calling cuEventSynchronize(). The status code
// returned by that call is discarded.
void CudaEvent::wait() {
cuEventSynchronize(event);
}
// Make the stream belonging to the given queue wait on this event by calling
// cuStreamWaitEvent() with flags set to 0. The status code is discarded.
void CudaEvent::queueWait(ComputeQueue queue) {
    // NOTE(review): the cast result is dereferenced without a null check, so this
    // presumably relies on every queue passed in being a CudaQueue - confirm against callers.
    CudaQueue* cudaQueue = dynamic_cast<CudaQueue*>(queue.get());
    cuStreamWaitEvent(cudaQueue->getStream(), event, 0);
}
This diff is collapsed.
......@@ -30,7 +30,6 @@
#include "CudaContext.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm>
#include <map>
#include <set>
......@@ -47,7 +46,7 @@ using namespace std;
}
class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait {
class CudaNonbondedUtilities::BlockSortTrait : public ComputeSortImpl::SortTrait {
public:
BlockSortTrait() {}
int getDataSize() const {return sizeof(int);}
......@@ -61,7 +60,7 @@ public:
};
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
pinnedCountBuffer(NULL), forceRebuildNeighborList(true), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
......@@ -82,18 +81,13 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
}
CudaNonbondedUtilities::~CudaNonbondedUtilities() {
if (blockSorter != NULL)
delete blockSorter;
if (pinnedCountBuffer != NULL)
cuMemFreeHost(pinnedCountBuffer);
cuEventDestroy(downloadCountEvent);
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList) {
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, useNeighborList, false);
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList) {
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff");
......@@ -289,7 +283,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(), numAtomBlocks, false);
blockSorter = context.createSort(new BlockSortTrait(), numAtomBlocks, false);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Portions copyright (c) 2010-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,7 +32,7 @@
using namespace OpenMM;
using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform) :
CudaSort::CudaSort(CudaContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels.
......@@ -92,21 +92,22 @@ CudaSort::~CudaSort() {
delete trait;
}
void CudaSort::sort(CudaArray& data) {
void CudaSort::sort(ArrayInterface& data) {
if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0)
return;
CudaArray& cudata = context.unwrap(data);
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel.
if (dataLength <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength);
buckets.copyTo(data);
buckets.copyTo(cudata);
}
else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
void* sortArgs[] = {&cudata.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
}
......@@ -114,14 +115,14 @@ void CudaSort::sort(CudaArray& data) {
// Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
void* rangeArgs[] = {&cudata.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
void* elementsArgs[] = {&cudata.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
&bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
context.executeKernel(assignElementsKernel, elementsArgs, cudata.getSize(), 128);
// Compute the position of each bucket.
......@@ -130,13 +131,13 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
void* copyArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
&bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
context.executeKernel(copyToBucketsKernel, copyArgs, cudata.getSize());
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((cudata.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
}
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2021 Stanford University and the Authors. *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -48,7 +48,7 @@ using namespace std;
CudaPlatform platform;
class SortTrait : public CudaSort::SortTrait {
class SortTrait : public ComputeSortImpl::SortTrait {
int getDataSize() const {return 4;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "float";}
......
......@@ -51,7 +51,6 @@
#include "HipArray.h"
#include "HipBondedUtilities.h"
#include "HipExpressionUtilities.h"
#include "HipFFT3D.h"
#include "HipIntegrationUtilities.h"
#include "HipNonbondedUtilities.h"
#include "HipPlatform.h"
......@@ -156,6 +155,12 @@ public:
* one ComputeContext is created for each device.
*/
std::vector<ComputeContext*> getAllContexts();
/**
* Get the ContextImpl this ComputeContext is associated with.
*
* @return a reference obtained by dereferencing platformData.context
*/
ContextImpl& getContextImpl() {
return *platformData.context;
}
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
......@@ -178,6 +183,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/**
* Create an object for performing 3D FFTs. It is returned as an FFT3D handle rather than a
* raw pointer, so the caller does not delete it directly.
......@@ -187,7 +205,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
HipFFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/**
* Get the smallest legal size for a dimension of the grid.
*/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment