Unverified Commit 2443dcee authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Common implementation of NonbondedForce (#4922)

* Use common API for kernels

* More code uses common interface

* Bug fixes

* Unified interface for sorting

* Simplified interface for FFT

* Use common event API for synchronization

* Minor changes to make code more consistent between platforms

* Common implementation of NonbondedForce

* Bug fixes

* Flag to enable list of single pairs

* CUDA and OpenCL use common implementation of NonbondedForce

* Fixed compilation error

* HIP uses common implementation of NonbondedForce
parent dfb8d755
#ifndef OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_
#define OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/kernels.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/common/ComputeContext.h"
#include "openmm/common/ComputeEvent.h"
#include "openmm/common/ComputeQueue.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/FFT3D.h"
#include <map>
#include <string>
#include <utility>
#include <vector>
namespace OpenMM {
/**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 *
 * It is written against the ComputeContext interface.  Subclasses are expected to call
 * commonInitialize() from their own initialize() method.
 */
class CommonCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
    /**
     * Create the kernel.
     *
     * @param name      the name of the kernel, passed on to CalcNonbondedForceKernel
     * @param platform  the Platform this kernel belongs to, passed on to CalcNonbondedForceKernel
     * @param cc        the ComputeContext this kernel will use.  Only a reference is stored.
     * @param system    the System this kernel will be applied to (not used by this constructor)
     */
    CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform),
            // Initializers are listed in member declaration order, which is the order in which
            // they actually run.  Every raw pointer member starts out null so that none of them
            // is ever left holding an indeterminate value.
            cc(cc), info(nullptr), hasInitializedKernel(false), pmeio(nullptr), syncQueue(nullptr) {
    }
    ~CommonCalcNonbondedForceKernel();
    /**
     * Initialize the kernel.  Subclasses should call this from their initialize() method.
     *
     * @param system                       the System this kernel will be applied to
     * @param force                        the NonbondedForce this kernel will be used for
     * @param usePmeQueue                  whether to perform PME on a separate queue
     * @param deviceIsCpu                  whether the device this calculation is running on is a CPU
     * @param useFixedPointChargeSpreading whether PME charge spreading should be done in fixed point or floating point
     * @param useCpuPme                    whether to perform the PME reciprocal space calculation on the CPU
     */
    void commonInitialize(const System& system, const NonbondedForce& force, bool usePmeQueue, bool deviceIsCpu, bool useFixedPointChargeSpreading, bool useCpuPme);
    /**
     * Execute the kernel to calculate the forces and/or energy.
     *
     * @param context           the context in which to execute this kernel
     * @param includeForces     true if forces should be calculated
     * @param includeEnergy     true if the energy should be calculated
     * @param includeDirect     true if direct space interactions should be included
     * @param includeReciprocal true if reciprocal space interactions should be included
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
    /**
     * Copy changed parameters over to a context.
     *
     * @param context        the context to copy parameters to
     * @param force          the NonbondedForce to copy the parameters from
     * @param firstParticle  the index of the first particle whose parameters might have changed
     * @param lastParticle   the index of the last particle whose parameters might have changed
     * @param firstException the index of the first exception whose parameters might have changed
     * @param lastException  the index of the last exception whose parameters might have changed
     */
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException);
    /**
     * Get the parameters being used for PME.
     *
     * @param alpha the separation parameter
     * @param nx    the number of grid points along the X axis
     * @param ny    the number of grid points along the Y axis
     * @param nz    the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
    /**
     * Get the parameters being used for the dispersion term in LJPME.
     *
     * @param alpha the separation parameter
     * @param nx    the number of grid points along the X axis
     * @param ny    the number of grid points along the Y axis
     * @param nz    the number of grid points along the Z axis
     */
    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
    /**
     * Describes the records handed to the ComputeSort: 8 byte "int2" values that are
     * ordered by their 4 byte "int" y component.
     */
    class SortTrait : public ComputeSortImpl::SortTrait {
        int getDataSize() const {return 8;}
        int getKeySize() const {return 4;}
        const char* getDataType() const {return "int2";}
        const char* getKeyType() const {return "int";}
        const char* getMinKey() const {return "(-2147483647-1)";}
        const char* getMaxKey() const {return "2147483647";}
        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
        const char* getSortKey() const {return "value.y";}
    };
    // Helper classes that are defined along with the implementation of this kernel.
    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncQueuePreComputation;
    class SyncQueuePostComputation;
    // NOTE: the constructor's initializer list follows the declaration order below.
    // Keep the two in sync when adding or reordering members.
    ComputeContext& cc;
    ForceInfo* info;
    bool hasInitializedKernel;
    ComputeArray charges;
    ComputeArray sigmaEpsilon;
    ComputeArray exceptionParams;
    ComputeArray exclusionAtoms;
    ComputeArray exclusionParams;
    ComputeArray baseParticleParams;
    ComputeArray baseExceptionParams;
    ComputeArray particleParamOffsets;
    ComputeArray exceptionParamOffsets;
    ComputeArray particleOffsetIndices;
    ComputeArray exceptionOffsetIndices;
    ComputeArray globalParams;
    ComputeArray cosSinSums;
    ComputeArray pmeGrid1;
    ComputeArray pmeGrid2;
    ComputeArray pmeBsplineModuliX;
    ComputeArray pmeBsplineModuliY;
    ComputeArray pmeBsplineModuliZ;
    ComputeArray pmeDispersionBsplineModuliX;
    ComputeArray pmeDispersionBsplineModuliY;
    ComputeArray pmeDispersionBsplineModuliZ;
    ComputeArray pmeAtomGridIndex;
    ComputeArray pmeEnergyBuffer;
    ComputeArray chargeBuffer;
    ComputeSort sort;
    ComputeQueue pmeQueue;
    ComputeEvent pmeSyncEvent, paramsSyncEvent;
    FFT3D fft, dispersionFft;
    Kernel cpuPme;
    PmeIO* pmeio;
    SyncQueuePostComputation* syncQueue;
    ComputeKernel computeParamsKernel, computeExclusionParamsKernel, computePlasmaCorrectionKernel;
    ComputeKernel ewaldSumsKernel, ewaldForcesKernel;
    ComputeKernel pmeGridIndexKernel, pmeDispersionGridIndexKernel;
    ComputeKernel pmeSpreadChargeKernel, pmeDispersionSpreadChargeKernel;
    ComputeKernel pmeFinishSpreadChargeKernel, pmeDispersionFinishSpreadChargeKernel;
    ComputeKernel pmeConvolutionKernel, pmeDispersionConvolutionKernel;
    ComputeKernel pmeEvalEnergyKernel, pmeDispersionEvalEnergyKernel;
    ComputeKernel pmeInterpolateForceKernel, pmeDispersionInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
    std::vector<std::string> paramNames;
    std::vector<double> paramValues;
    std::map<int, int> exceptionIndex;
    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha, totalCharge;
    int gridSizeX, gridSizeY, gridSizeZ;
    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
    bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme;
    bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
};
} // namespace OpenMM
#endif /*OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_*/
...@@ -37,11 +37,13 @@ ...@@ -37,11 +37,13 @@
#include "openmm/common/ComputeForceInfo.h" #include "openmm/common/ComputeForceInfo.h"
#include "openmm/common/ComputeProgram.h" #include "openmm/common/ComputeProgram.h"
#include "openmm/common/ComputeQueue.h" #include "openmm/common/ComputeQueue.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/ComputeVectorTypes.h" #include "openmm/common/ComputeVectorTypes.h"
#include "openmm/common/FFT3D.h" #include "openmm/common/FFT3D.h"
#include "openmm/common/IntegrationUtilities.h" #include "openmm/common/IntegrationUtilities.h"
#include "openmm/common/NonbondedUtilities.h" #include "openmm/common/NonbondedUtilities.h"
#include "openmm/Vec3.h" #include "openmm/Vec3.h"
#include "openmm/internal/ContextImpl.h"
#include <condition_variable> #include <condition_variable>
#include <map> #include <map>
#include <mutex> #include <mutex>
...@@ -139,6 +141,10 @@ public: ...@@ -139,6 +141,10 @@ public:
* one ComputeContext is created for each device. * one ComputeContext is created for each device.
*/ */
virtual std::vector<ComputeContext*> getAllContexts() = 0; virtual std::vector<ComputeContext*> getAllContexts() = 0;
/**
* Get the ContextImpl this ComputeContext is associated with.
*/
virtual ContextImpl& getContextImpl() = 0;
/** /**
* Get a workspace used for accumulating energy when a simulation is parallelized across * Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices. * multiple devices.
...@@ -169,6 +175,19 @@ public: ...@@ -169,6 +175,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform. * Construct a ComputeEvent object of the appropriate class for this platform.
*/ */
virtual ComputeEvent createEvent() = 0; virtual ComputeEvent createEvent() = 0;
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
virtual ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true) = 0;
/** /**
* Compile source code to create a ComputeProgram. * Compile source code to create a ComputeProgram.
* *
...@@ -501,7 +520,7 @@ public: ...@@ -501,7 +520,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed * @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex. * @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/ */
virtual FFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false) = 0; virtual FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false) = 0;
/** /**
* Get the smallest legal size for a dimension of the grid. * Get the smallest legal size for a dimension of the grid.
*/ */
...@@ -511,6 +530,15 @@ public: ...@@ -511,6 +530,15 @@ public:
* It ensures all contexts are fully initialized. * It ensures all contexts are fully initialized.
*/ */
virtual void initializeContexts() = 0; virtual void initializeContexts() = 0;
/**
* Set the particle charges. These are packed into the fourth element of the posq array.
*/
virtual void setCharges(const std::vector<double>& charges) = 0;
/**
* Request to use the fourth element of the posq array for storing charges. Since only one force can
* do that, this returns true the first time it is called, and false on all subsequent calls.
*/
virtual bool requestPosqCharges() = 0;
/** /**
* Get the thread used by this context for executing parallel computations. * Get the thread used by this context for executing parallel computations.
*/ */
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -27,17 +27,18 @@ ...@@ -27,17 +27,18 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * * along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "ComputeQueue.h"
#include <memory> #include <memory>
namespace OpenMM { namespace OpenMM {
/** /**
* This abstract class represents an event for synchronization between the host and * This abstract class represents an event for synchronization between the host and device,
* device. It is created by calling createEvent() on a ComputeContext, which returns * or between queues on the same device. It is created by calling createEvent() on a ComputeContext,
* an instance of a platform-specific subclass. To use it, call enqueue() immediately * which returns an instance of a platform-specific subclass. To use it, call enqueue() immediately
* after starting an asynchronous operation, such as a kernel invocation or non-blocking * after starting an asynchronous operation, such as a kernel invocation or non-blocking data
* data transfer. Then at a later point call wait(). This will cause the host to block * transfer. Then at a later point call wait() or queueWait(). This will cause the host or a
* until all operations started before the call to enequeue() have completed. * specified queue to block until all operations started before the call to enequeue() have completed.
* *
* Instead of referring to this class directly, it is best to use a ComputeEvent, which is * Instead of referring to this class directly, it is best to use a ComputeEvent, which is
* a typedef for a shared_ptr to a ComputeEventImpl. This allows you to treat it as having * a typedef for a shared_ptr to a ComputeEventImpl. This allows you to treat it as having
...@@ -56,6 +57,11 @@ public: ...@@ -56,6 +57,11 @@ public:
* Block until all operations started before the call to enqueue() have completed. * Block until all operations started before the call to enqueue() have completed.
*/ */
virtual void wait() = 0; virtual void wait() = 0;
/**
* Enqueue a barrier that causes a specified ComputeQueue to block until all
* operations started before the call to enqueue() have completed.
*/
virtual void queueWait(ComputeQueue queue) = 0;
}; };
typedef std::shared_ptr<ComputeEventImpl> ComputeEvent; typedef std::shared_ptr<ComputeEventImpl> ComputeEvent;
......
#ifndef OPENMM_COMPUTESORT_H_
#define OPENMM_COMPUTESORT_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/common/ArrayInterface.h"
#include "openmm/common/windowsExportCommon.h"
#include <memory>
namespace OpenMM {
/**
* This abstract class represents an algorithm for sorting arrays. It is created
* by calling createSort() on a ComputeContext, which returns an instance of a
* platform-specific subclass.
*
* Instead of referring to this class directly, it is best to use a ComputeSort, which is
* a typedef for a shared_ptr to a ComputeSortImpl. This allows you to treat it as having
* value semantics, and frees you from having to manage memory.
*/
class OPENMM_EXPORT_COMMON ComputeSortImpl {
public:
    /**
     * Describes the values being sorted and the key used to order them.
     */
    class SortTrait;
    virtual ~ComputeSortImpl() = default;
    /**
     * Sort an array.
     *
     * @param data the array to sort
     */
    virtual void sort(ArrayInterface& data) = 0;
};
typedef std::shared_ptr<ComputeSortImpl> ComputeSort;
/**
 * SortTrait describes the data a ComputeSort operates on.  A subclass defines the
 * type of the values to sort and the key used for sorting them.
 */
class ComputeSortImpl::SortTrait {
public:
    virtual ~SortTrait() = default;
    /**
     * @return the size of each data value in bytes
     */
    virtual int getDataSize() const = 0;
    /**
     * @return the size of each key value in bytes
     */
    virtual int getKeySize() const = 0;
    /**
     * @return the data type of the values to sort
     */
    virtual const char* getDataType() const = 0;
    /**
     * @return the data type of the sorting key
     */
    virtual const char* getKeyType() const = 0;
    /**
     * @return the minimum value a key can take
     */
    virtual const char* getMinKey() const = 0;
    /**
     * @return the maximum value a key can take
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * @return a value whose key is guaranteed to equal getMaxKey()
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * @return the source code to select the key from the data value
     */
    virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif /*OPENMM_COMPUTESORT_H_*/
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "openmm/common/ArrayInterface.h" #include "openmm/common/ArrayInterface.h"
#include <memory>
namespace OpenMM { namespace OpenMM {
...@@ -44,11 +45,15 @@ namespace OpenMM { ...@@ -44,11 +45,15 @@ namespace OpenMM {
* Note that this class performs an unnormalized transform. That means that if you perform * Note that this class performs an unnormalized transform. That means that if you perform
* a forward transform followed immediately by an inverse transform, the effect is to * a forward transform followed immediately by an inverse transform, the effect is to
* multiply every value of the original data set by the total number of data points. * multiply every value of the original data set by the total number of data points.
*
* Instead of referring to this class directly, it is best to use an FFT3D, which is
* a typedef for a shared_ptr to an FFT3DImpl. This allows you to treat it as having
* value semantics, and frees you from having to manage memory.
*/ */
class OPENMM_EXPORT_COMMON FFT3D { class OPENMM_EXPORT_COMMON FFT3DImpl {
public: public:
virtual ~FFT3D() { virtual ~FFT3DImpl() {
} }
/** /**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output * Perform a Fourier transform. The transform cannot be done in-place: the input and output
...@@ -66,6 +71,8 @@ public: ...@@ -66,6 +71,8 @@ public:
virtual void execFFT(ArrayInterface& in, ArrayInterface& out, bool forward=true) = 0; virtual void execFFT(ArrayInterface& in, ArrayInterface& out, bool forward=true) = 0;
}; };
typedef std::shared_ptr<FFT3DImpl> FFT3D;
} // namespace OpenMM } // namespace OpenMM
#endif // __OPENMM_FFT3D_H__ #endif // __OPENMM_FFT3D_H__
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. * * Portions copyright (c) 2009-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -71,8 +71,11 @@ public: ...@@ -71,8 +71,11 @@ public:
* @param forceGroup the force group in which the interaction should be calculated * @param forceGroup the force group in which the interaction should be calculated
* @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should * @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway. * be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/ */
virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList=true) = 0; virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const std::vector<std::vector<int> >& exclusionList, const std::string& kernel,
int forceGroup, bool useNeighborList=true, bool supportsPairList=false) = 0;
/** /**
* Add a per-atom parameter that the default interaction kernel may depend on. * Add a per-atom parameter that the default interaction kernel may depend on.
*/ */
......
This diff is collapsed.
...@@ -42,7 +42,6 @@ ...@@ -42,7 +42,6 @@
#include "CudaArray.h" #include "CudaArray.h"
#include "CudaBondedUtilities.h" #include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h" #include "CudaExpressionUtilities.h"
#include "CudaFFT3D.h"
#include "CudaIntegrationUtilities.h" #include "CudaIntegrationUtilities.h"
#include "CudaNonbondedUtilities.h" #include "CudaNonbondedUtilities.h"
#include "CudaPlatform.h" #include "CudaPlatform.h"
...@@ -154,6 +153,12 @@ public: ...@@ -154,6 +153,12 @@ public:
* one ComputeContext is created for each device. * one ComputeContext is created for each device.
*/ */
std::vector<ComputeContext*> getAllContexts(); std::vector<ComputeContext*> getAllContexts();
/**
* Get the ContextImpl this ComputeContext is associated with.
*/
ContextImpl& getContextImpl() {
return *platformData.context;
}
/** /**
* Get a workspace used for accumulating energy when a simulation is parallelized across * Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices. * multiple devices.
...@@ -176,6 +181,19 @@ public: ...@@ -176,6 +181,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform. * Construct a ComputeEvent object of the appropriate class for this platform.
*/ */
ComputeEvent createEvent(); ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/** /**
* Compile source code to create a ComputeProgram. * Compile source code to create a ComputeProgram.
* *
...@@ -515,7 +533,7 @@ public: ...@@ -515,7 +533,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed * @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex. * @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/ */
CudaFFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false); FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/** /**
* This should be called by the Integrator from its own initialize() method. * This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized. * It ensures all contexts are fully initialized.
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -48,6 +48,11 @@ public: ...@@ -48,6 +48,11 @@ public:
* Block until all operations started before the call to enqueue() have completed. * Block until all operations started before the call to enqueue() have completed.
*/ */
void wait(); void wait();
/**
* Enqueue a barrier that causes a specified ComputeQueue to block until all
* operations started before the call to enqueue() have completed.
*/
void queueWait(ComputeQueue queue);
private: private:
CudaContext& context; CudaContext& context;
CUevent event; CUevent event;
......
...@@ -50,7 +50,7 @@ class CudaContext; ...@@ -50,7 +50,7 @@ class CudaContext;
* multiply every value of the original data set by the total number of data points. * multiply every value of the original data set by the total number of data points.
*/ */
class OPENMM_EXPORT_COMMON CudaFFT3D : public FFT3D { class OPENMM_EXPORT_COMMON CudaFFT3D : public FFT3DImpl {
public: public:
/** /**
* Create a CudaFFT3D object for performing transforms of a particular size. * Create a CudaFFT3D object for performing transforms of a particular size.
......
...@@ -30,11 +30,12 @@ ...@@ -30,11 +30,12 @@
#include "CudaPlatform.h" #include "CudaPlatform.h"
#include "CudaArray.h" #include "CudaArray.h"
#include "CudaContext.h" #include "CudaContext.h"
#include "CudaFFT3D.h"
#include "CudaSort.h"
#include "openmm/kernels.h" #include "openmm/kernels.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/common/CommonKernels.h" #include "openmm/common/CommonKernels.h"
#include "openmm/common/CommonCalcNonbondedForce.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/FFT3D.h"
namespace OpenMM { namespace OpenMM {
...@@ -85,12 +86,11 @@ private: ...@@ -85,12 +86,11 @@ private:
/** /**
* This kernel is invoked by NonbondedForce to calculate the forces acting on the system. * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
*/ */
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class CudaCalcNonbondedForceKernel : public CommonCalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) { CommonCalcNonbondedForceKernel(name, platform, cu, system), cu(cu) {
} }
~CudaCalcNonbondedForceKernel();
/** /**
* Initialize the kernel. * Initialize the kernel.
* *
...@@ -98,123 +98,8 @@ public: ...@@ -98,123 +98,8 @@ public:
* @param force the NonbondedForce this kernel will be used for * @param force the NonbondedForce this kernel will be used for
*/ */
void initialize(const System& system, const NonbondedForce& force); void initialize(const System& system, const NonbondedForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @param includeDirect true if direct space interactions should be included
* @param includeReciprocal true if reciprocal space interactions should be included
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the NonbondedForce to copy the parameters from
* @param firstParticle the index of the first particle whose parameters might have changed
* @param lastParticle the index of the last particle whose parameters might have changed
* @param firstException the index of the first exception whose parameters might have changed
* @param lastException the index of the last exception whose parameters might have changed
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException);
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private: private:
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 8;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "int2";}
const char* getKeyType() const {return "int";}
const char* getMinKey() const {return "(-2147483647-1)";}
const char* getMaxKey() const {return "2147483647";}
const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
const char* getSortKey() const {return "value.y";}
};
class ForceInfo;
class PmeIO;
class PmePreComputation;
class PmePostComputation;
class SyncStreamPreComputation;
class SyncStreamPostComputation;
CudaContext& cu; CudaContext& cu;
ForceInfo* info;
bool hasInitializedFFT;
CudaArray charges;
CudaArray sigmaEpsilon;
CudaArray exceptionParams;
CudaArray exclusionAtoms;
CudaArray exclusionParams;
CudaArray baseParticleParams;
CudaArray baseExceptionParams;
CudaArray particleParamOffsets;
CudaArray exceptionParamOffsets;
CudaArray particleOffsetIndices;
CudaArray exceptionOffsetIndices;
CudaArray globalParams;
CudaArray cosSinSums;
CudaArray pmeGrid1;
CudaArray pmeGrid2;
CudaArray pmeBsplineModuliX;
CudaArray pmeBsplineModuliY;
CudaArray pmeBsplineModuliZ;
CudaArray pmeDispersionBsplineModuliX;
CudaArray pmeDispersionBsplineModuliY;
CudaArray pmeDispersionBsplineModuliZ;
CudaArray pmeAtomGridIndex;
CudaArray pmeEnergyBuffer;
CudaArray chargeBuffer;
CudaSort* sort;
Kernel cpuPme;
PmeIO* pmeio;
ComputeQueue pmeQueue;
CUevent pmeSyncEvent, paramsSyncEvent;
CudaFFT3D* fft;
CudaFFT3D* dispersionFft;
CUfunction computeParamsKernel, computeExclusionParamsKernel, computePlasmaCorrectionKernel;
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeDispersionGridIndexKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeDispersionSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeDispersionFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel;
CUfunction pmeEvalDispersionEnergyKernel;
CUfunction pmeConvolutionKernel;
CUfunction pmeDispersionConvolutionKernel;
CUfunction pmeInterpolateForceKernel;
CUfunction pmeInterpolateDispersionForceKernel;
std::vector<std::pair<int, int> > exceptionAtoms;
std::vector<std::string> paramNames;
std::vector<double> paramValues;
std::map<int, int> exceptionIndex;
double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha, totalCharge;
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
}; };
/** /**
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "openmm/System.h" #include "openmm/System.h"
#include "CudaArray.h" #include "CudaArray.h"
#include "CudaExpressionUtilities.h" #include "CudaExpressionUtilities.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/NonbondedUtilities.h" #include "openmm/common/NonbondedUtilities.h"
#include <cuda.h> #include <cuda.h>
#include <sstream> #include <sstream>
...@@ -39,7 +40,6 @@ ...@@ -39,7 +40,6 @@
namespace OpenMM { namespace OpenMM {
class CudaContext; class CudaContext;
class CudaSort;
/** /**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two * This class provides a generic interface for calculating nonbonded interactions. It does this in two
...@@ -71,20 +71,6 @@ public: ...@@ -71,20 +71,6 @@ public:
class ParameterInfo; class ParameterInfo;
CudaNonbondedUtilities(CudaContext& context); CudaNonbondedUtilities(CudaContext& context);
~CudaNonbondedUtilities(); ~CudaNonbondedUtilities();
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
* @param usesCutoff specifies whether a cutoff should be applied to this interaction
* @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction
* @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction.
* @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList=true);
/** /**
* Add a nonbonded interaction to be evaluated by the default interaction kernel. * Add a nonbonded interaction to be evaluated by the default interaction kernel.
* *
...@@ -99,7 +85,9 @@ public: ...@@ -99,7 +85,9 @@ public:
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway. * be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/ */
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList); void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const std::vector<std::vector<int> >& exclusionList, const std::string& kernel,
int forceGroup, bool useNeighborList=true, bool supportsPairList=false);
/** /**
* Add a per-atom parameter that the default interaction kernel may depend on. * Add a per-atom parameter that the default interaction kernel may depend on.
*/ */
...@@ -343,7 +331,7 @@ private: ...@@ -343,7 +331,7 @@ private:
CudaArray largeBlockBoundingBox; CudaArray largeBlockBoundingBox;
CudaArray oldPositions; CudaArray oldPositions;
CudaArray rebuildNeighborList; CudaArray rebuildNeighborList;
CudaSort* blockSorter; ComputeSort blockSorter;
CUevent downloadCountEvent; CUevent downloadCountEvent;
unsigned int* pinnedCountBuffer; unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs; std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. * * Portions copyright (c) 2010-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CudaArray.h" #include "CudaArray.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/windowsExportCommon.h" #include "openmm/common/windowsExportCommon.h"
#include "CudaContext.h" #include "CudaContext.h"
...@@ -41,7 +42,7 @@ namespace OpenMM { ...@@ -41,7 +42,7 @@ namespace OpenMM {
* sort and the key for sorting it. Here is an example of a trait class for * sort and the key for sorting it. Here is an example of a trait class for
* sorting floats: * sorting floats:
* *
* class FloatTrait : public CudaSort::SortTrait { * class FloatTrait : public ComputeSortImpl::SortTrait {
* int getDataSize() const {return 4;} * int getDataSize() const {return 4;}
* int getKeySize() const {return 4;} * int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";} * const char* getDataType() const {return "float";}
...@@ -66,9 +67,8 @@ namespace OpenMM { ...@@ -66,9 +67,8 @@ namespace OpenMM {
* elements). * elements).
*/ */
class OPENMM_EXPORT_COMMON CudaSort { class OPENMM_EXPORT_COMMON CudaSort : public ComputeSortImpl {
public: public:
class SortTrait;
/** /**
* Create a CudaSort object for sorting data of a particular type. * Create a CudaSort object for sorting data of a particular type.
* *
...@@ -82,15 +82,15 @@ public: ...@@ -82,15 +82,15 @@ public:
* of the algorithm to be tuned for faster performance on the expected * of the algorithm to be tuned for faster performance on the expected
* distribution. * distribution.
*/ */
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform=true); CudaSort(CudaContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
~CudaSort(); ~CudaSort();
/** /**
* Sort an array. * Sort an array.
*/ */
void sort(CudaArray& data); void sort(ArrayInterface& data);
private: private:
CudaContext& context; CudaContext& context;
SortTrait* trait; ComputeSortImpl::SortTrait* trait;
CudaArray dataRange; CudaArray dataRange;
CudaArray bucketOfElement; CudaArray bucketOfElement;
CudaArray offsetInBucket; CudaArray offsetInBucket;
...@@ -101,48 +101,6 @@ private: ...@@ -101,48 +101,6 @@ private:
bool isShortList, uniform; bool isShortList, uniform;
}; };
/**
 * Abstract description of the elements handled by a CudaSort.  A concrete
 * subclass states what type of value is being sorted and how the sorting key
 * is obtained from each value.
 */
class CudaSort::SortTrait {
public:
    virtual ~SortTrait() = default;
    /**
     * Size of a single data value, in bytes.
     */
    virtual int getDataSize() const = 0;
    /**
     * Size of a single key value, in bytes.
     */
    virtual int getKeySize() const = 0;
    /**
     * Name of the data type of the values being sorted.
     */
    virtual const char* getDataType() const = 0;
    /**
     * Name of the data type of the sorting key.
     */
    virtual const char* getKeyType() const = 0;
    /**
     * The smallest value a key can take.
     */
    virtual const char* getMinKey() const = 0;
    /**
     * The largest value a key can take.
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * A data value whose key is guaranteed to equal getMaxKey().
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * The CUDA code that selects the key from a data value.
     */
    virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM } // namespace OpenMM
#endif // __OPENMM_CUDASORT_H__ #endif // __OPENMM_CUDASORT_H__
...@@ -32,11 +32,13 @@ ...@@ -32,11 +32,13 @@
#include "CudaArray.h" #include "CudaArray.h"
#include "CudaBondedUtilities.h" #include "CudaBondedUtilities.h"
#include "CudaEvent.h" #include "CudaEvent.h"
#include "CudaFFT3D.h"
#include "CudaIntegrationUtilities.h" #include "CudaIntegrationUtilities.h"
#include "CudaKernels.h" #include "CudaKernels.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h" #include "CudaNonbondedUtilities.h"
#include "CudaProgram.h" #include "CudaProgram.h"
#include "CudaSort.h"
#include "openmm/common/ComputeArray.h" #include "openmm/common/ComputeArray.h"
#include "openmm/common/ContextSelector.h" #include "openmm/common/ContextSelector.h"
#include "SHA1.h" #include "SHA1.h"
...@@ -439,8 +441,8 @@ void CudaContext::initializeContexts() { ...@@ -439,8 +441,8 @@ void CudaContext::initializeContexts() {
getPlatformData().initializeContexts(system); getPlatformData().initializeContexts(system);
} }
CudaFFT3D* CudaContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex) { FFT3D CudaContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex) {
return new CudaFFT3D(*this, xsize, ysize, zsize, realToComplex); return FFT3D(new CudaFFT3D(*this, xsize, ysize, zsize, realToComplex));
} }
void CudaContext::setAsCurrent() { void CudaContext::setAsCurrent() {
...@@ -667,6 +669,10 @@ ComputeEvent CudaContext::createEvent() { ...@@ -667,6 +669,10 @@ ComputeEvent CudaContext::createEvent() {
return shared_ptr<ComputeEventImpl>(new CudaEvent(*this)); return shared_ptr<ComputeEventImpl>(new CudaEvent(*this));
} }
/**
 * Construct a ComputeSort backed by a CudaSort that belongs to this context.
 *
 * @param trait    heap-allocated SortTrait describing the data to sort; it is handed
 *                 to the CudaSort, whose destructor deletes it
 * @param length   the length of the arrays the sorter will be used on
 * @param uniform  hint about whether the input data follows a uniform distribution
 * @return the newly created sorter
 */
ComputeSort CudaContext::createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) {
    shared_ptr<ComputeSortImpl> sorter(new CudaSort(*this, trait, length, uniform));
    return sorter;
}
ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) { ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines); CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines);
return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module)); return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module));
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -42,9 +42,13 @@ CudaEvent::~CudaEvent() { ...@@ -42,9 +42,13 @@ CudaEvent::~CudaEvent() {
} }
void CudaEvent::enqueue() { void CudaEvent::enqueue() {
cuEventRecord(event, 0); cuEventRecord(event, context.getCurrentStream());
} }
void CudaEvent::wait() { void CudaEvent::wait() {
cuEventSynchronize(event); cuEventSynchronize(event);
} }
/**
 * Make the stream belonging to a queue wait for this event.
 *
 * @param queue  the queue whose stream should wait; it is downcast to CudaQueue
 */
void CudaEvent::queueWait(ComputeQueue queue) {
    // NOTE(review): the result of the downcast is used without a null check, so
    // this presumably relies on callers only ever passing CudaQueue objects - confirm.
    CudaQueue* cudaQueue = dynamic_cast<CudaQueue*>(queue.get());
    cuStreamWaitEvent(cudaQueue->getStream(), event, 0);
}
This diff is collapsed.
...@@ -30,7 +30,6 @@ ...@@ -30,7 +30,6 @@
#include "CudaContext.h" #include "CudaContext.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h" #include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <set> #include <set>
...@@ -47,7 +46,7 @@ using namespace std; ...@@ -47,7 +46,7 @@ using namespace std;
} }
class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait { class CudaNonbondedUtilities::BlockSortTrait : public ComputeSortImpl::SortTrait {
public: public:
BlockSortTrait() {} BlockSortTrait() {}
int getDataSize() const {return sizeof(int);} int getDataSize() const {return sizeof(int);}
...@@ -61,7 +60,7 @@ public: ...@@ -61,7 +60,7 @@ public:
}; };
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true), CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) { pinnedCountBuffer(NULL), forceRebuildNeighborList(true), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
...@@ -82,18 +81,13 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c ...@@ -82,18 +81,13 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
} }
CudaNonbondedUtilities::~CudaNonbondedUtilities() { CudaNonbondedUtilities::~CudaNonbondedUtilities() {
if (blockSorter != NULL)
delete blockSorter;
if (pinnedCountBuffer != NULL) if (pinnedCountBuffer != NULL)
cuMemFreeHost(pinnedCountBuffer); cuMemFreeHost(pinnedCountBuffer);
cuEventDestroy(downloadCountEvent); cuEventDestroy(downloadCountEvent);
} }
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList) { void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, useNeighborList, false); const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList) {
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList) {
if (groupCutoff.size() > 0) { if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff) if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff"); throw OpenMMException("All Forces must agree on whether to use a cutoff");
...@@ -289,7 +283,7 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -289,7 +283,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox"); largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(), numAtomBlocks, false); blockSorter = context.createSort(new BlockSortTrait(), numAtomBlocks, false);
vector<unsigned int> count(2, 0); vector<unsigned int> count(2, 0);
interactionCount.upload(count); interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]); rebuildNeighborList.upload(&count[0]);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. * * Portions copyright (c) 2010-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform) : CudaSort::CudaSort(CudaContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) { context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels. // Create kernels.
...@@ -92,21 +92,22 @@ CudaSort::~CudaSort() { ...@@ -92,21 +92,22 @@ CudaSort::~CudaSort() {
delete trait; delete trait;
} }
void CudaSort::sort(CudaArray& data) { void CudaSort::sort(ArrayInterface& data) {
if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize()) if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size"); throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0) if (data.getSize() == 0)
return; return;
CudaArray& cudata = context.unwrap(data);
if (isShortList) { if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel. // We can use a simpler sort kernel that does the entire operation in one kernel.
if (dataLength <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) { if (dataLength <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength}; void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength); context.executeKernel(shortList2Kernel, sortArgs, dataLength);
buckets.copyTo(data); buckets.copyTo(cudata);
} }
else { else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength}; void* sortArgs[] = {&cudata.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize()); context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
} }
} }
...@@ -114,14 +115,14 @@ void CudaSort::sort(CudaArray& data) { ...@@ -114,14 +115,14 @@ void CudaSort::sort(CudaArray& data) {
// Compute the range of data values. // Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize(); unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()}; void* rangeArgs[] = {&cudata.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize()); context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets. // Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(), void* elementsArgs[] = {&cudata.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
&bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()}; &bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128); context.executeKernel(assignElementsKernel, elementsArgs, cudata.getSize(), 128);
// Compute the position of each bucket. // Compute the position of each bucket.
...@@ -130,13 +131,13 @@ void CudaSort::sort(CudaArray& data) { ...@@ -130,13 +131,13 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets. // Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(), void* copyArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
&bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()}; &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize()); context.executeKernel(copyToBucketsKernel, copyArgs, cudata.getSize());
// Sort each bucket. // Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()}; void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize()); context.executeKernel(sortBucketsKernel, sortArgs, ((cudata.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
} }
} }
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2021 Stanford University and the Authors. * * Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -48,7 +48,7 @@ using namespace std; ...@@ -48,7 +48,7 @@ using namespace std;
CudaPlatform platform; CudaPlatform platform;
class SortTrait : public CudaSort::SortTrait { class SortTrait : public ComputeSortImpl::SortTrait {
int getDataSize() const {return 4;} int getDataSize() const {return 4;}
int getKeySize() const {return 4;} int getKeySize() const {return 4;}
const char* getDataType() const {return "float";} const char* getDataType() const {return "float";}
......
...@@ -51,7 +51,6 @@ ...@@ -51,7 +51,6 @@
#include "HipArray.h" #include "HipArray.h"
#include "HipBondedUtilities.h" #include "HipBondedUtilities.h"
#include "HipExpressionUtilities.h" #include "HipExpressionUtilities.h"
#include "HipFFT3D.h"
#include "HipIntegrationUtilities.h" #include "HipIntegrationUtilities.h"
#include "HipNonbondedUtilities.h" #include "HipNonbondedUtilities.h"
#include "HipPlatform.h" #include "HipPlatform.h"
...@@ -156,6 +155,12 @@ public: ...@@ -156,6 +155,12 @@ public:
* one ComputeContext is created for each device. * one ComputeContext is created for each device.
*/ */
std::vector<ComputeContext*> getAllContexts(); std::vector<ComputeContext*> getAllContexts();
/**
 * Get the ContextImpl this ComputeContext is associated with.
 *
 * @return a reference to the ContextImpl pointed to by platformData.context
 */
ContextImpl& getContextImpl() {
return *platformData.context;
}
/** /**
* Get a workspace used for accumulating energy when a simulation is parallelized across * Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices. * multiple devices.
...@@ -178,6 +183,19 @@ public: ...@@ -178,6 +183,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform. * Construct a ComputeEvent object of the appropriate class for this platform.
*/ */
ComputeEvent createEvent(); ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/** /**
* Create an object for performing 3D FFTs. The caller is responsible for deleting * Create an object for performing 3D FFTs. The caller is responsible for deleting
* the object when it is no longer needed. * the object when it is no longer needed.
...@@ -187,7 +205,7 @@ public: ...@@ -187,7 +205,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed * @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex. * @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/ */
HipFFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false); FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/** /**
* Get the smallest legal size for a dimension of the grid. * Get the smallest legal size for a dimension of the grid.
*/ */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment