Unverified Commit 2443dcee authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Common implementation of NonbondedForce (#4922)

* Use common API for kernels

* More code uses common interface

* Bug fixes

* Unified interface for sorting

* Simplified interface for FFT

* Use common event API for synchronization

* Minor changes to make code more consistent between platforms

* Common implementation of NonbondedForce

* Bug fixes

* Flag to enable list of single pairs

* CUDA and OpenCL use common implementation of NonbondedForce

* Fixed compilation error

* HIP uses common implementation of NonbondedForce
parent dfb8d755
#ifndef OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_
#define OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/kernels.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/common/ComputeContext.h"
#include "openmm/common/ComputeEvent.h"
#include "openmm/common/ComputeQueue.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/FFT3D.h"
#include <map>
#include <string>
#include <utility>
#include <vector>
namespace OpenMM {
/**
* This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
*/
class CommonCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cc(cc), pmeio(NULL) {
}
~CommonCalcNonbondedForceKernel();
/**
* Initialize the kernel. Subclasses should call this from their initialize() method.
*
* @param system the System this kernel will be applied to
* @param force the NonbondedForce this kernel will be used for
* @param usePmeQueue whether to perform PME on a separate queue
* @param deviceIsCpu whether the device this calculation is running on is a CPU
* @param useFixedPointChargeSpreading whether PME charge spreading should be done in fixed point or floating point
* @param useCpuPme whether to perform the PME reciprocal space calculation on the CPU
*/
void commonInitialize(const System& system, const NonbondedForce& force, bool usePmeQueue, bool deviceIsCpu, bool useFixedPointChargeSpreading, bool useCpuPme);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @param includeDirect true if direct space interactions should be included
* @param includeReciprocal true if reciprocal space interactions should be included
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the NonbondedForce to copy the parameters from
* @param firstParticle the index of the first particle whose parameters might have changed
* @param lastParticle the index of the last particle whose parameters might have changed
* @param firstException the index of the first exception whose parameters might have changed
* @param lastException the index of the last exception whose parameters might have changed
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException);
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class SortTrait : public ComputeSortImpl::SortTrait {
int getDataSize() const {return 8;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "int2";}
const char* getKeyType() const {return "int";}
const char* getMinKey() const {return "(-2147483647-1)";}
const char* getMaxKey() const {return "2147483647";}
const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
const char* getSortKey() const {return "value.y";}
};
class ForceInfo;
class PmeIO;
class PmePreComputation;
class PmePostComputation;
class SyncQueuePreComputation;
class SyncQueuePostComputation;
ComputeContext& cc;
ForceInfo* info;
bool hasInitializedKernel;
ComputeArray charges;
ComputeArray sigmaEpsilon;
ComputeArray exceptionParams;
ComputeArray exclusionAtoms;
ComputeArray exclusionParams;
ComputeArray baseParticleParams;
ComputeArray baseExceptionParams;
ComputeArray particleParamOffsets;
ComputeArray exceptionParamOffsets;
ComputeArray particleOffsetIndices;
ComputeArray exceptionOffsetIndices;
ComputeArray globalParams;
ComputeArray cosSinSums;
ComputeArray pmeGrid1;
ComputeArray pmeGrid2;
ComputeArray pmeBsplineModuliX;
ComputeArray pmeBsplineModuliY;
ComputeArray pmeBsplineModuliZ;
ComputeArray pmeDispersionBsplineModuliX;
ComputeArray pmeDispersionBsplineModuliY;
ComputeArray pmeDispersionBsplineModuliZ;
ComputeArray pmeAtomGridIndex;
ComputeArray pmeEnergyBuffer;
ComputeArray chargeBuffer;
ComputeSort sort;
ComputeQueue pmeQueue;
ComputeEvent pmeSyncEvent, paramsSyncEvent;
FFT3D fft, dispersionFft;
Kernel cpuPme;
PmeIO* pmeio;
SyncQueuePostComputation* syncQueue;
ComputeKernel computeParamsKernel, computeExclusionParamsKernel, computePlasmaCorrectionKernel;
ComputeKernel ewaldSumsKernel, ewaldForcesKernel;
ComputeKernel pmeGridIndexKernel, pmeDispersionGridIndexKernel;
ComputeKernel pmeSpreadChargeKernel, pmeDispersionSpreadChargeKernel;
ComputeKernel pmeFinishSpreadChargeKernel, pmeDispersionFinishSpreadChargeKernel;
ComputeKernel pmeConvolutionKernel, pmeDispersionConvolutionKernel;
ComputeKernel pmeEvalEnergyKernel, pmeDispersionEvalEnergyKernel;
ComputeKernel pmeInterpolateForceKernel, pmeDispersionInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
std::vector<std::string> paramNames;
std::vector<double> paramValues;
std::map<int, int> exceptionIndex;
double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha, totalCharge;
int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme;
bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
} // namespace OpenMM
#endif /*OPENMM_COMMONCALCNONBONDEDFORCEKERNEL_H_*/
......@@ -37,11 +37,13 @@
#include "openmm/common/ComputeForceInfo.h"
#include "openmm/common/ComputeProgram.h"
#include "openmm/common/ComputeQueue.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/ComputeVectorTypes.h"
#include "openmm/common/FFT3D.h"
#include "openmm/common/IntegrationUtilities.h"
#include "openmm/common/NonbondedUtilities.h"
#include "openmm/Vec3.h"
#include "openmm/internal/ContextImpl.h"
#include <condition_variable>
#include <map>
#include <mutex>
......@@ -139,6 +141,10 @@ public:
* one ComputeContext is created for each device.
*/
virtual std::vector<ComputeContext*> getAllContexts() = 0;
/**
* Get the ContextImpl is ComputeContext is associated with.
*/
virtual ContextImpl& getContextImpl() = 0;
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
......@@ -169,6 +175,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
virtual ComputeEvent createEvent() = 0;
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
virtual ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true) = 0;
/**
* Compile source code to create a ComputeProgram.
*
......@@ -501,7 +520,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
virtual FFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false) = 0;
virtual FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false) = 0;
/**
* Get the smallest legal size for a dimension of the grid.
*/
......@@ -511,6 +530,15 @@ public:
* It ensures all contexts are fully initialized.
*/
virtual void initializeContexts() = 0;
/**
* Set the particle charges. These are packed into the fourth element of the posq array.
*/
virtual void setCharges(const std::vector<double>& charges) = 0;
/**
* Request to use the fourth element of the posq array for storing charges. Since only one force can
* do that, this returns true the first time it is called, and false on all subsequent calls.
*/
virtual bool requestPosqCharges() = 0;
/**
* Get the thread used by this context for executing parallel computations.
*/
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -27,17 +27,18 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "ComputeQueue.h"
#include <memory>
namespace OpenMM {
/**
* This abstract class represents an event for synchronization between the host and
* device. It is created by calling createEvent() on a ComputeContext, which returns
* an instance of a platform-specific subclass. To use it, call enqueue() immediately
* after starting an asynchronous operation, such as a kernel invocation or non-blocking
* data transfer. Then at a later point call wait(). This will cause the host to block
* until all operations started before the call to enequeue() have completed.
* This abstract class represents an event for synchronization between the host and device,
* or between queues on the same device. It is created by calling createEvent() on a ComputeContext,
* which returns an instance of a platform-specific subclass. To use it, call enqueue() immediately
* after starting an asynchronous operation, such as a kernel invocation or non-blocking data
* transfer. Then at a later point call wait() or queueWait(). This will cause the host or a
* specified queue to block until all operations started before the call to enequeue() have completed.
*
* Instead of referring to this class directly, it is best to use a ComputeEvent, which is
* a typedef for a shared_ptr to a ComputeEventImpl. This allows you to treat it as having
......@@ -56,6 +57,11 @@ public:
* Block until all operations started before the call to enqueue() have completed.
*/
virtual void wait() = 0;
/**
* Enqueue a barrier that causes a specified ComputeQueue to block until all
* operations started before the call to enqueue() have completed.
*/
virtual void queueWait(ComputeQueue queue) = 0;
};
typedef std::shared_ptr<ComputeEventImpl> ComputeEvent;
......
#ifndef OPENMM_COMPUTESORT_H_
#define OPENMM_COMPUTESORT_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/common/ArrayInterface.h"
#include "openmm/common/windowsExportCommon.h"
#include <memory>
namespace OpenMM {
/**
* This abstract class represents an algorithm for sorting arrays. It is created
* by calling createEvent() on a ComputeContext, which returns an instance of a
* platform-specific subclass.
*
* Instead of referring to this class directly, it is best to use a ComputeSort, which is
* a typedef for a shared_ptr to a ComputeSortImpl. This allows you to treat it as having
* value semantics, and frees you from having to manage memory.
*/
class OPENMM_EXPORT_COMMON ComputeSortImpl {
public:
class SortTrait;
virtual ~ComputeSortImpl() {
}
/**
* Sort an array.
*/
virtual void sort(ArrayInterface& data) = 0;
};
typedef std::shared_ptr<ComputeSortImpl> ComputeSort;
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class ComputeSortImpl::SortTrait {
public:
virtual ~SortTrait() {
}
/**
* Get the size of each data value in bytes.
*/
virtual int getDataSize() const = 0;
/**
* Get the size of each key value in bytes.
*/
virtual int getKeySize() const = 0;
/**
* Get the data type of the values to sort.
*/
virtual const char* getDataType() const = 0;
/**
* Get the data type of the sorting key.
*/
virtual const char* getKeyType() const = 0;
/**
* Get the minimum value a key can take.
*/
virtual const char* getMinKey() const = 0;
/**
* Get the maximum value a key can take.
*/
virtual const char* getMaxKey() const = 0;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual const char* getMaxValue() const = 0;
/**
* Get the source code to select the key from the data value.
*/
virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif /*OPENMM_COMPUTESORT_H_*/
......@@ -28,6 +28,7 @@
* -------------------------------------------------------------------------- */
#include "openmm/common/ArrayInterface.h"
#include <memory>
namespace OpenMM {
......@@ -44,11 +45,15 @@ namespace OpenMM {
* Note that this class performs an unnormalized transform. That means that if you perform
* a forward transform followed immediately by an inverse transform, the effect is to
* multiply every value of the original data set by the total number of data points.
*
* Instead of referring to this class directly, it is best to use a FFT3D, which is
* a typedef for a shared_ptr to a FFT3DImpl. This allows you to treat it as having
* value semantics, and frees you from having to manage memory.
*/
class OPENMM_EXPORT_COMMON FFT3D {
class OPENMM_EXPORT_COMMON FFT3DImpl {
public:
virtual ~FFT3D() {
virtual ~FFT3DImpl() {
}
/**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output
......@@ -66,6 +71,8 @@ public:
virtual void execFFT(ArrayInterface& in, ArrayInterface& out, bool forward=true) = 0;
};
typedef std::shared_ptr<FFT3DImpl> FFT3D;
} // namespace OpenMM
#endif // __OPENMM_FFT3D_H__
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2009-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -71,8 +71,11 @@ public:
* @param forceGroup the force group in which the interaction should be calculated
* @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList=true) = 0;
virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const std::vector<std::vector<int> >& exclusionList, const std::string& kernel,
int forceGroup, bool useNeighborList=true, bool supportsPairList=false) = 0;
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/Context.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "openmm/common/BondedUtilities.h"
#include "openmm/common/CommonCalcNonbondedForce.h"
#include "openmm/common/CommonKernelUtilities.h"
#include "openmm/common/ContextSelector.h"
#include "openmm/common/NonbondedUtilities.h"
#include "CommonKernelSources.h"
#include "SimTKOpenMMRealType.h"
#include <algorithm>
#include <assert.h>
#include <cmath>
#include <iterator>
#include <set>
using namespace OpenMM;
using namespace std;
class CommonCalcNonbondedForceKernel::ForceInfo : public ComputeForceInfo {
public:
ForceInfo(const NonbondedForce& force) : force(force) {
particleOffset.resize(force.getNumParticles(), -1);
exceptionOffset.resize(force.getNumExceptions(), -1);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string parameter;
int particleIndex;
double chargeScale, sigmaScale, epsilonScale;
force.getParticleParameterOffset(i, parameter, particleIndex, chargeScale, sigmaScale, epsilonScale);
particleOffset[particleIndex] = i;
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string parameter;
int exceptionIndex;
double chargeProdScale, sigmaScale, epsilonScale;
force.getExceptionParameterOffset(i, parameter, exceptionIndex, chargeProdScale, sigmaScale, epsilonScale);
exceptionOffset[exceptionIndex] = i;
}
}
bool areParticlesIdentical(int particle1, int particle2) {
if (particleOffset[particle1] != -1 || particleOffset[particle2] != -1) {
if (particleOffset[particle1] == -1 || particleOffset[particle2] == -1)
return false;
string parameter1, parameter2;
int particleIndex1, particleIndex2;
double chargeScale1, sigmaScale1, epsilonScale1, chargeScale2, sigmaScale2, epsilonScale2;
force.getParticleParameterOffset(particleOffset[particle1], parameter1, particleIndex1, chargeScale1, sigmaScale1, epsilonScale1);
force.getParticleParameterOffset(particleOffset[particle2], parameter2, particleIndex2, chargeScale2, sigmaScale2, epsilonScale2);
if (parameter1 != parameter2 || chargeScale1 != chargeScale2 || sigmaScale1 != sigmaScale2 || epsilonScale1 != epsilonScale2)
return false;
}
double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
int getNumParticleGroups() {
return force.getNumExceptions();
}
void getParticlesInGroup(int index, vector<int>& particles) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
particles.resize(2);
particles[0] = particle1;
particles[1] = particle2;
}
bool areGroupsIdentical(int group1, int group2) {
if (exceptionOffset[group1] != -1 || exceptionOffset[group2] != -1) {
if (exceptionOffset[group1] == -1 || exceptionOffset[group2] == -1)
return false;
string parameter1, parameter2;
int exceptionIndex1, exceptionIndex2;
double chargeProdScale1, sigmaScale1, epsilonScale1, chargeProdScale2, sigmaScale2, epsilonScale2;
force.getExceptionParameterOffset(exceptionOffset[group1], parameter1, exceptionIndex1, chargeProdScale1, sigmaScale1, epsilonScale1);
force.getExceptionParameterOffset(exceptionOffset[group2], parameter2, exceptionIndex2, chargeProdScale2, sigmaScale2, epsilonScale2);
if (parameter1 != parameter2 || chargeProdScale1 != chargeProdScale2 || sigmaScale1 != sigmaScale2 || epsilonScale1 != epsilonScale2)
return false;
}
int particle1, particle2;
double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
private:
const NonbondedForce& force;
vector<int> particleOffset, exceptionOffset;
};
class CommonCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(ComputeContext& cc, ComputeKernel addForcesKernel) : cc(cc), addForcesKernel(addForcesKernel) {
forceTemp.initialize<mm_float4>(cc, cc.getNumAtoms(), "PmeForce");
addForcesKernel->addArg(forceTemp);
addForcesKernel->addArg();
}
float* getPosq() {
ContextSelector selector(cc);
cc.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp.upload(force);
addForcesKernel->setArg(1, cc.getLongForceBuffer());
addForcesKernel->execute(cc.getNumAtoms());
}
private:
ComputeContext& cc;
vector<mm_float4> posq;
ComputeArray forceTemp;
ComputeKernel addForcesKernel;
};
class CommonCalcNonbondedForceKernel::PmePreComputation : public ComputeContext::ForcePreComputation {
public:
PmePreComputation(ComputeContext& cc, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cc(cc), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxVectors[3];
cc.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
}
private:
ComputeContext& cc;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class CommonCalcNonbondedForceKernel::PmePostComputation : public ComputeContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class CommonCalcNonbondedForceKernel::SyncQueuePreComputation : public ComputeContext::ForcePreComputation {
public:
SyncQueuePreComputation(ComputeContext& cc, ComputeQueue queue, ComputeEvent event, int forceGroup) : cc(cc), queue(queue), event(event), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
event->enqueue();
event->queueWait(queue);
}
}
private:
ComputeContext& cc;
ComputeQueue queue;
ComputeEvent event;
int forceGroup;
};
class CommonCalcNonbondedForceKernel::SyncQueuePostComputation : public ComputeContext::ForcePostComputation {
public:
SyncQueuePostComputation(ComputeContext& cc, ComputeEvent event, ComputeArray& pmeEnergyBuffer, int forceGroup) : cc(cc), event(event),
pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
}
void setKernel(ComputeKernel kernel) {
addEnergyKernel = kernel;
addEnergyKernel->addArg(pmeEnergyBuffer);
addEnergyKernel->addArg(cc.getEnergyBuffer());
addEnergyKernel->addArg((int) pmeEnergyBuffer.getSize());
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
event->wait();
if (includeEnergy)
addEnergyKernel->execute(pmeEnergyBuffer.getSize());
}
return 0.0;
}
private:
ComputeContext& cc;
ComputeEvent event;
ComputeKernel addEnergyKernel;
ComputeArray& pmeEnergyBuffer;
int forceGroup;
};
CommonCalcNonbondedForceKernel::~CommonCalcNonbondedForceKernel() {
ContextSelector selector(cc);
if (pmeio != NULL)
delete pmeio;
}
void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, const NonbondedForce& force, bool usePmeQueue,
bool deviceIsCpu, bool useFixedPointChargeSpreading, bool useCpuPme) {
this->usePmeQueue = false;
this->deviceIsCpu = deviceIsCpu;
this->useFixedPointChargeSpreading = useFixedPointChargeSpreading;
this->useCpuPme = useCpuPme;
ContextSelector selector(cc);
int forceIndex;
for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
;
string prefix = "nonbonded"+cc.intToString(forceIndex)+"_";
// Identify which exceptions are 1-4 interactions.
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<pair<int, int> > exclusions;
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
exclusions.push_back(pair<int, int>(particle1, particle2));
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
exceptionIndex[i] = exceptions.size();
exceptions.push_back(i);
}
}
// Initialize nonbonded interactions.
int numParticles = force.getNumParticles();
vector<mm_float4> baseParticleParamVec(cc.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
vector<vector<int> > exclusionList(numParticles);
hasCoulomb = false;
hasLJ = false;
for (int i = 0; i < numParticles; i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = mm_float4(charge, sigma, epsilon, 0);
exclusionList[i].push_back(i);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (auto exclusion : exclusions) {
exclusionList[exclusion.first].push_back(exclusion.second);
exclusionList[exclusion.second].push_back(exclusion.first);
}
nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
bool useCutoff = (nonbondedMethod != NoCutoff);
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cc.requestPosqCharges() : false;
map<string, string> defines;
defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
if (useCutoff) {
// Compute the reaction field constants.
double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
defines["REACTION_FIELD_K"] = cc.doubleToString(reactionFieldK);
defines["REACTION_FIELD_C"] = cc.doubleToString(reactionFieldC);
// Compute the switching coefficients.
if (force.getUseSwitchingFunction()) {
defines["LJ_SWITCH_CUTOFF"] = cc.doubleToString(force.getSwitchingDistance());
defines["LJ_SWITCH_C3"] = cc.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
defines["LJ_SWITCH_C4"] = cc.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
defines["LJ_SWITCH_C5"] = cc.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
}
}
if (force.getUseDispersionCorrection() && cc.getContextIndex() == 0 && !doLJPME)
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
dispersionCoefficient = 0.0;
alpha = 0;
ewaldSelfEnergy = 0.0;
totalCharge = 0.0;
map<string, string> paramsDefines;
paramsDefines["ONE_4PI_EPS0"] = cc.doubleToString(ONE_4PI_EPS0);
paramsDefines["EPSILON0"] = cc.doubleToString(EPSILON0);
paramsDefines["WORK_GROUP_SIZE"] = cc.intToString(cc.ThreadBlockSize);
hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
if (hasOffsets)
paramsDefines["HAS_OFFSETS"] = "1";
if (force.getNumParticleParameterOffsets() > 0)
paramsDefines["HAS_PARTICLE_OFFSETS"] = "1";
if (force.getNumExceptionParameterOffsets() > 0)
paramsDefines["HAS_EXCEPTION_OFFSETS"] = "1";
if (usePosqCharges)
paramsDefines["USE_POSQ_CHARGES"] = "1";
if (doLJPME)
paramsDefines["INCLUDE_LJPME_EXCEPTIONS"] = "1";
if (nonbondedMethod == Ewald) {
// Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz;
NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
defines["EWALD_ALPHA"] = cc.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cc.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
if (cc.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cc.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
totalCharge += baseParticleParamVec[i].x;
}
// Create the reciprocal space kernels.
map<string, string> replacements;
replacements["NUM_ATOMS"] = cc.intToString(numParticles);
replacements["PADDED_NUM_ATOMS"] = cc.intToString(cc.getPaddedNumAtoms());
replacements["KMAX_X"] = cc.intToString(kmaxx);
replacements["KMAX_Y"] = cc.intToString(kmaxy);
replacements["KMAX_Z"] = cc.intToString(kmaxz);
replacements["EXP_COEFFICIENT"] = cc.doubleToString(-1.0/(4.0*alpha*alpha));
replacements["ONE_4PI_EPS0"] = cc.doubleToString(ONE_4PI_EPS0);
replacements["M_PI"] = cc.doubleToString(M_PI);
ComputeProgram program = cc.compileProgram(CommonKernelSources::ewald, replacements);
ewaldSumsKernel = program->createKernel("calculateEwaldCosSinSums");
ewaldForcesKernel = program->createKernel("calculateEwaldForces");
int elementSize = (cc.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
cosSinSums.initialize(cc, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
}
}
else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
// Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = cc.findLegalFFTDimension(gridSizeX);
gridSizeY = cc.findLegalFFTDimension(gridSizeY);
gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
if (doLJPME) {
NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = cc.findLegalFFTDimension(dispersionGridSizeX);
dispersionGridSizeY = cc.findLegalFFTDimension(dispersionGridSizeY);
dispersionGridSizeZ = cc.findLegalFFTDimension(dispersionGridSizeZ);
}
defines["EWALD_ALPHA"] = cc.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cc.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
defines["DO_LJPME"] = doLJPME ? "1" : "0";
if (doLJPME) {
defines["EWALD_DISPERSION_ALPHA"] = cc.doubleToString(dispersionAlpha);
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
double dar4 = dar2*dar2;
double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
defines["INVCUT6"] = cc.doubleToString(invRCut6);
defines["MULTSHIFT6"] = cc.doubleToString(multShift6);
}
if (cc.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cc.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
totalCharge += baseParticleParamVec[i].x;
}
if (doLJPME) {
paramsDefines["INCLUDE_LJPME"] = "1";
paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cc.doubleToString(pow(dispersionAlpha, 6)/3.0);
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
pmeDefines["PME_ORDER"] = cc.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cc.intToString(numParticles);
pmeDefines["PADDED_NUM_ATOMS"] = cc.intToString(cc.getPaddedNumAtoms());
pmeDefines["RECIP_EXP_FACTOR"] = cc.doubleToString(M_PI*M_PI/(alpha*alpha));
pmeDefines["GRID_SIZE_X"] = cc.intToString(gridSizeX);
pmeDefines["GRID_SIZE_Y"] = cc.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cc.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cc.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cc.doubleToString(M_PI);
if (useFixedPointChargeSpreading)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (deviceIsCpu)
pmeDefines["DEVICE_IS_CPU"] = "1";
if (useCpuPme && !doLJPME && usePosqCharges) {
// Create the CPU PME kernel.
try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), cc.getContextImpl());
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, false);
ComputeProgram program = cc.compileProgram(CommonKernelSources::pme, pmeDefines);
ComputeKernel addForcesKernel = program->createKernel("addForces");
pmeio = new PmeIO(cc, addForcesKernel);
cc.addPreComputation(new PmePreComputation(cc, cpuPme, *pmeio));
cc.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
}
}
if (pmeio == NULL) {
// Create required data structures.
int elementSize = (cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME) {
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
pmeGrid1.initialize(cc, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cc, gridElements, 2*elementSize, "pmeGrid2");
if (useFixedPointChargeSpreading)
cc.addAutoclearBuffer(pmeGrid2);
else
cc.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cc, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cc, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cc, gridSizeZ, elementSize, "pmeBsplineModuliZ");
if (doLJPME) {
pmeDispersionBsplineModuliX.initialize(cc, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
pmeDispersionBsplineModuliY.initialize(cc, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
pmeDispersionBsplineModuliZ.initialize(cc, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
}
pmeAtomGridIndex.initialize<mm_int2>(cc, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer.initialize(cc, cc.getNumThreadBlocks()*ComputeContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cc.clearBuffer(pmeEnergyBuffer);
sort = cc.createSort(new SortTrait(), cc.getNumAtoms());
fft = cc.createFFT(gridSizeX, gridSizeY, gridSizeZ, true);
if (doLJPME)
dispersionFft = cc.createFFT(dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
this->usePmeQueue = usePmeQueue;
if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1";
pmeQueue = cc.createQueue();
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
pmeSyncEvent = cc.createEvent();
paramsSyncEvent = cc.createEvent();
cc.addPreComputation(new SyncQueuePreComputation(cc, pmeQueue, pmeSyncEvent, recipForceGroup));
cc.addPostComputation(syncQueue = new SyncQueuePostComputation(cc, pmeSyncEvent, pmeEnergyBuffer, recipForceGroup));
}
// Initialize the b-spline moduli.
for (int grid = 0; grid < 2; grid++) {
int xsize, ysize, zsize;
ComputeArray *xmoduli, *ymoduli, *zmoduli;
if (grid == 0) {
xsize = gridSizeX;
ysize = gridSizeY;
zsize = gridSizeZ;
xmoduli = &pmeBsplineModuliX;
ymoduli = &pmeBsplineModuliY;
zmoduli = &pmeBsplineModuliZ;
}
else {
if (!doLJPME)
continue;
xsize = dispersionGridSizeX;
ysize = dispersionGridSizeY;
zsize = dispersionGridSizeZ;
xmoduli = &pmeDispersionBsplineModuliX;
ymoduli = &pmeDispersionBsplineModuliY;
zmoduli = &pmeDispersionBsplineModuliZ;
}
int maxSize = max(max(xsize, ysize), zsize);
vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder);
vector<double> bsplines_data(maxSize);
data[PmeOrder-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
// Differentiate.
ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++)
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PmeOrder-1);
data[PmeOrder-1] = 0.0;
for (int i = 1; i < (PmeOrder-1); i++)
data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
data[0] = div*data[0];
for (int i = 0; i < maxSize; i++)
bsplines_data[i] = 0.0;
for (int i = 1; i <= PmeOrder; i++)
bsplines_data[i] = data[i-1];
// Evaluate the actual bspline moduli for X/Y/Z.
for (int dim = 0; dim < 3; dim++) {
int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
vector<double> moduli(ndata);
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
}
moduli[i] = sc*sc+ss*ss;
}
for (int i = 0; i < ndata; i++)
if (moduli[i] < 1.0e-7)
moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5;
if (dim == 0)
xmoduli->upload(moduli, true);
else if (dim == 1)
ymoduli->upload(moduli, true);
else
zmoduli->upload(moduli, true);
}
}
}
}
}
// Add code to subtract off the reciprocal part of excluded interactions.
if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
int numContexts = cc.getNumContexts();
int startIndex = cc.getContextIndex()*force.getNumExceptions()/numContexts;
int endIndex = (cc.getContextIndex()+1)*force.getNumExceptions()/numContexts;
int numExclusions = endIndex-startIndex;
if (numExclusions > 0) {
paramsDefines["HAS_EXCLUSIONS"] = "1";
vector<vector<int> > atoms(numExclusions, vector<int>(2));
exclusionAtoms.initialize<mm_int2>(cc, numExclusions, "exclusionAtoms");
exclusionParams.initialize<mm_float4>(cc, numExclusions, "exclusionParams");
vector<mm_int2> exclusionAtomsVec(numExclusions);
for (int i = 0; i < numExclusions; i++) {
int j = i+startIndex;
exclusionAtomsVec[i] = mm_int2(exclusions[j].first, exclusions[j].second);
atoms[i][0] = exclusions[j].first;
atoms[i][1] = exclusions[j].second;
}
exclusionAtoms.upload(exclusionAtomsVec);
map<string, string> replacements;
replacements["PARAMS"] = cc.getBondedUtilities().addArgument(exclusionParams, "float4");
replacements["EWALD_ALPHA"] = cc.doubleToString(alpha);
replacements["TWO_OVER_SQRT_PI"] = cc.doubleToString(2.0/sqrt(M_PI));
replacements["DO_LJPME"] = doLJPME ? "1" : "0";
replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
if (doLJPME)
replacements["EWALD_DISPERSION_ALPHA"] = cc.doubleToString(dispersionAlpha);
if (force.getIncludeDirectSpace())
cc.getBondedUtilities().addInteraction(atoms, cc.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
}
}
// Add the interaction to the default nonbonded kernel.
string source = cc.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
charges.initialize(cc, cc.getPaddedNumAtoms(), cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
baseParticleParams.initialize<mm_float4>(cc, cc.getPaddedNumAtoms(), "baseParticleParams");
baseParticleParams.upload(baseParticleParamVec);
map<string, string> replacements;
replacements["ONE_4PI_EPS0"] = cc.doubleToString(ONE_4PI_EPS0);
if (usePosqCharges) {
replacements["CHARGE1"] = "posq1.w";
replacements["CHARGE2"] = "posq2.w";
}
else {
replacements["CHARGE1"] = prefix+"charge1";
replacements["CHARGE2"] = prefix+"charge2";
}
if (hasCoulomb && !usePosqCharges)
cc.getNonbondedUtilities().addParameter(ComputeParameterInfo(charges, prefix+"charge", "real", 1));
sigmaEpsilon.initialize<mm_float2>(cc, cc.getPaddedNumAtoms(), "sigmaEpsilon");
if (hasLJ) {
replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
cc.getNonbondedUtilities().addParameter(ComputeParameterInfo(sigmaEpsilon, prefix+"sigmaEpsilon", "float", 2));
}
source = cc.replaceStrings(source, replacements);
if (force.getIncludeDirectSpace())
cc.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);
// Initialize the exceptions.
int numContexts = cc.getNumContexts();
int startIndex = cc.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cc.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions > 0) {
paramsDefines["HAS_EXCEPTIONS"] = "1";
exceptionAtoms.resize(numExceptions);
vector<vector<int> > atoms(numExceptions, vector<int>(2));
exceptionParams.initialize<mm_float4>(cc, numExceptions, "exceptionParams");
baseExceptionParams.initialize<mm_float4>(cc, numExceptions, "baseExceptionParams");
vector<mm_float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
baseExceptionParamsVec[i] = mm_float4(chargeProd, sigma, epsilon, 0);
exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
}
baseExceptionParams.upload(baseExceptionParamsVec);
map<string, string> replacements;
replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
replacements["PARAMS"] = cc.getBondedUtilities().addArgument(exceptionParams, "float4");
if (force.getIncludeDirectSpace())
cc.getBondedUtilities().addInteraction(atoms, cc.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
}
// Initialize parameter offsets.
vector<vector<mm_float4> > particleOffsetVec(force.getNumParticles());
vector<vector<mm_float4> > exceptionOffsetVec(numExceptions);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
particleOffsetVec[particle].push_back(mm_float4(charge, sigma, epsilon, paramIndex));
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
int index = exceptionIndex[exception];
if (index < startIndex || index >= endIndex)
continue;
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
exceptionOffsetVec[index-startIndex].push_back(mm_float4(charge, sigma, epsilon, paramIndex));
}
paramValues.resize(paramNames.size(), 0.0);
particleParamOffsets.initialize<mm_float4>(cc, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
particleOffsetIndices.initialize<int>(cc, cc.getPaddedNumAtoms()+1, "particleOffsetIndices");
vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
vector<mm_float4> p, e;
for (int i = 0; i < particleOffsetVec.size(); i++) {
particleOffsetIndicesVec.push_back(p.size());
for (int j = 0; j < particleOffsetVec[i].size(); j++)
p.push_back(particleOffsetVec[i][j]);
}
while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
particleOffsetIndicesVec.push_back(p.size());
for (int i = 0; i < exceptionOffsetVec.size(); i++) {
exceptionOffsetIndicesVec.push_back(e.size());
for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
e.push_back(exceptionOffsetVec[i][j]);
}
exceptionOffsetIndicesVec.push_back(e.size());
if (force.getNumParticleParameterOffsets() > 0) {
particleParamOffsets.upload(p);
particleOffsetIndices.upload(particleOffsetIndicesVec);
}
exceptionParamOffsets.initialize<mm_float4>(cc, max((int) e.size(), 1), "exceptionParamOffsets");
exceptionOffsetIndices.initialize<int>(cc, exceptionOffsetIndicesVec.size(), "exceptionOffsetIndices");
if (e.size() > 0) {
exceptionParamOffsets.upload(e);
exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
}
globalParams.initialize(cc, max((int) paramValues.size(), 1), cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
if (paramValues.size() > 0)
globalParams.upload(paramValues, true);
chargeBuffer.initialize(cc, cc.getNumThreadBlocks(), cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "chargeBuffer");
cc.clearBuffer(chargeBuffer);
recomputeParams = true;
// Initialize the kernel for updating parameters.
ComputeProgram program = cc.compileProgram(CommonKernelSources::nonbondedParameters, paramsDefines);
computeParamsKernel = program->createKernel("computeParameters");
computeExclusionParamsKernel = program->createKernel("computeExclusionParameters");
computePlasmaCorrectionKernel = program->createKernel("computePlasmaCorrection");
info = new ForceInfo(force);
cc.addForce(info);
}
double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
ContextSelector selector(cc);
if (!hasInitializedKernel) {
hasInitializedKernel = true;
computeParamsKernel->addArg(cc.getEnergyBuffer());
computeParamsKernel->addArg();
computeParamsKernel->addArg(globalParams);
computeParamsKernel->addArg(cc.getPaddedNumAtoms());
computeParamsKernel->addArg(baseParticleParams);
computeParamsKernel->addArg(cc.getPosq());
computeParamsKernel->addArg(charges);
computeParamsKernel->addArg(sigmaEpsilon);
computeParamsKernel->addArg(particleParamOffsets);
computeParamsKernel->addArg(particleOffsetIndices);
computeParamsKernel->addArg(chargeBuffer);
if (exceptionParams.isInitialized()) {
computeParamsKernel->addArg((int) exceptionParams.getSize());
computeParamsKernel->addArg(baseExceptionParams);
computeParamsKernel->addArg(exceptionParams);
computeParamsKernel->addArg(exceptionParamOffsets);
computeParamsKernel->addArg(exceptionOffsetIndices);
}
if (exclusionParams.isInitialized()) {
computeExclusionParamsKernel->addArg(cc.getPosq());
computeExclusionParamsKernel->addArg(charges);
computeExclusionParamsKernel->addArg(sigmaEpsilon);
computeExclusionParamsKernel->addArg((int) exclusionParams.getSize());
computeExclusionParamsKernel->addArg(exclusionAtoms);
computeExclusionParamsKernel->addArg(exclusionParams);
}
computePlasmaCorrectionKernel->addArg(chargeBuffer);
computePlasmaCorrectionKernel->addArg(cc.getEnergyBuffer());
if (cc.getUseDoublePrecision())
computePlasmaCorrectionKernel->addArg(alpha);
else
computePlasmaCorrectionKernel->addArg((float) alpha);
computePlasmaCorrectionKernel->addArg();
if (cosSinSums.isInitialized()) {
ewaldSumsKernel->addArg(cc.getEnergyBuffer());
ewaldSumsKernel->addArg(cc.getPosq());
ewaldSumsKernel->addArg(cosSinSums);
ewaldSumsKernel->addArg();
ewaldForcesKernel->addArg(cc.getLongForceBuffer());
ewaldForcesKernel->addArg(cc.getPosq());
ewaldForcesKernel->addArg(cosSinSums);
ewaldForcesKernel->addArg();
}
if (pmeGrid1.isInitialized()) {
// Create kernels for Coulomb PME.
map<string, string> replacements;
replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
ComputeProgram program = cc.compileProgram(cc.replaceStrings(CommonKernelSources::pme, replacements), pmeDefines);
pmeGridIndexKernel = program->createKernel("findAtomGridIndex");
pmeSpreadChargeKernel = program->createKernel("gridSpreadCharge");
pmeConvolutionKernel = program->createKernel("reciprocalConvolution");
pmeEvalEnergyKernel = program->createKernel("gridEvaluateEnergy");
pmeInterpolateForceKernel = program->createKernel("gridInterpolateForce");
pmeGridIndexKernel->addArg(cc.getPosq());
pmeGridIndexKernel->addArg(pmeAtomGridIndex);
for (int i = 0; i < 8; i++)
pmeGridIndexKernel->addArg();
pmeSpreadChargeKernel->addArg(cc.getPosq());
if (useFixedPointChargeSpreading)
pmeSpreadChargeKernel->addArg(pmeGrid2);
else
pmeSpreadChargeKernel->addArg(pmeGrid1);
for (int i = 0; i < 8; i++)
pmeSpreadChargeKernel->addArg();
pmeSpreadChargeKernel->addArg(pmeAtomGridIndex);
pmeSpreadChargeKernel->addArg(charges);
pmeConvolutionKernel->addArg(pmeGrid2);
pmeConvolutionKernel->addArg(pmeBsplineModuliX);
pmeConvolutionKernel->addArg(pmeBsplineModuliY);
pmeConvolutionKernel->addArg(pmeBsplineModuliZ);
for (int i = 0; i < 3; i++)
pmeConvolutionKernel->addArg();
pmeEvalEnergyKernel->addArg(pmeGrid2);
if (usePmeQueue)
pmeEvalEnergyKernel->addArg(pmeEnergyBuffer);
else
pmeEvalEnergyKernel->addArg(cc.getEnergyBuffer());
pmeEvalEnergyKernel->addArg(pmeBsplineModuliX);
pmeEvalEnergyKernel->addArg(pmeBsplineModuliY);
pmeEvalEnergyKernel->addArg(pmeBsplineModuliZ);
for (int i = 0; i < 3; i++)
pmeEvalEnergyKernel->addArg();
pmeInterpolateForceKernel->addArg(cc.getPosq());
pmeInterpolateForceKernel->addArg(cc.getLongForceBuffer());
pmeInterpolateForceKernel->addArg(pmeGrid1);
for (int i = 0; i < 8; i++)
pmeInterpolateForceKernel->addArg();
pmeInterpolateForceKernel->addArg(pmeAtomGridIndex);
pmeInterpolateForceKernel->addArg(charges);
if (useFixedPointChargeSpreading) {
pmeFinishSpreadChargeKernel = program->createKernel("finishSpreadCharge");
pmeFinishSpreadChargeKernel->addArg(pmeGrid2);
pmeFinishSpreadChargeKernel->addArg(pmeGrid1);
}
if (usePmeQueue)
syncQueue->setKernel(program->createKernel("addEnergy"));
if (doLJPME) {
// Create kernels for LJ PME.
pmeDefines["EWALD_ALPHA"] = cc.doubleToString(dispersionAlpha);
pmeDefines["GRID_SIZE_X"] = cc.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_Y"] = cc.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Z"] = cc.intToString(dispersionGridSizeZ);
pmeDefines["EPSILON_FACTOR"] = "1";
pmeDefines["RECIP_EXP_FACTOR"] = cc.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
program = cc.compileProgram(CommonKernelSources::pme, pmeDefines);
pmeDispersionGridIndexKernel = program->createKernel("findAtomGridIndex");
pmeDispersionSpreadChargeKernel = program->createKernel("gridSpreadCharge");
pmeDispersionConvolutionKernel = program->createKernel("reciprocalConvolution");
pmeDispersionEvalEnergyKernel = program->createKernel("gridEvaluateEnergy");
pmeDispersionInterpolateForceKernel = program->createKernel("gridInterpolateForce");
pmeDispersionGridIndexKernel->addArg(cc.getPosq());
pmeDispersionGridIndexKernel->addArg(pmeAtomGridIndex);
for (int i = 0; i < 8; i++)
pmeDispersionGridIndexKernel->addArg();
pmeDispersionSpreadChargeKernel->addArg(cc.getPosq());
if (useFixedPointChargeSpreading)
pmeDispersionSpreadChargeKernel->addArg(pmeGrid2);
else
pmeDispersionSpreadChargeKernel->addArg(pmeGrid1);
for (int i = 0; i < 8; i++)
pmeDispersionSpreadChargeKernel->addArg();
pmeDispersionSpreadChargeKernel->addArg(pmeAtomGridIndex);
pmeDispersionSpreadChargeKernel->addArg(sigmaEpsilon);
pmeDispersionConvolutionKernel->addArg(pmeGrid2);
pmeDispersionConvolutionKernel->addArg(pmeDispersionBsplineModuliX);
pmeDispersionConvolutionKernel->addArg(pmeDispersionBsplineModuliY);
pmeDispersionConvolutionKernel->addArg(pmeDispersionBsplineModuliZ);
for (int i = 0; i < 3; i++)
pmeDispersionConvolutionKernel->addArg();
pmeDispersionEvalEnergyKernel->addArg(pmeGrid2);
if (usePmeQueue)
pmeDispersionEvalEnergyKernel->addArg(pmeEnergyBuffer);
else
pmeDispersionEvalEnergyKernel->addArg(cc.getEnergyBuffer());
pmeDispersionEvalEnergyKernel->addArg(pmeDispersionBsplineModuliX);
pmeDispersionEvalEnergyKernel->addArg(pmeDispersionBsplineModuliY);
pmeDispersionEvalEnergyKernel->addArg(pmeDispersionBsplineModuliZ);
for (int i = 0; i < 3; i++)
pmeDispersionEvalEnergyKernel->addArg();
pmeDispersionInterpolateForceKernel->addArg(cc.getPosq());
pmeDispersionInterpolateForceKernel->addArg(cc.getLongForceBuffer());
pmeDispersionInterpolateForceKernel->addArg(pmeGrid1);
for (int i = 0; i < 8; i++)
pmeDispersionInterpolateForceKernel->addArg();
pmeDispersionInterpolateForceKernel->addArg(pmeAtomGridIndex);
pmeDispersionInterpolateForceKernel->addArg(sigmaEpsilon);
if (useFixedPointChargeSpreading) {
pmeDispersionFinishSpreadChargeKernel = program->createKernel("finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel->addArg(pmeGrid2);
pmeDispersionFinishSpreadChargeKernel->addArg(pmeGrid1);
}
}
}
}
// Update particle and exception parameters.
bool paramChanged = false;
for (int i = 0; i < paramNames.size(); i++) {
double value = context.getParameter(paramNames[i]);
if (value != paramValues[i]) {
paramValues[i] = value;;
paramChanged = true;
}
}
if (paramChanged) {
recomputeParams = true;
globalParams.upload(paramValues, true);
}
double energy = 0.0;
if (includeReciprocal && (pmeGrid1.isInitialized() || cosSinSums.isInitialized())) {
Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c);
double volume = a[0]*b[1]*c[2];
energy = ewaldSelfEnergy - totalCharge*totalCharge/(8*EPSILON0*volume*alpha*alpha);
}
if (recomputeParams || hasOffsets) {
computeParamsKernel->setArg(1, (int) (includeEnergy && includeReciprocal));
computeParamsKernel->execute(cc.getNumAtoms());
if (exclusionParams.isInitialized())
computeExclusionParamsKernel->execute(exclusionParams.getSize());
if (usePmeQueue) {
paramsSyncEvent->enqueue();
paramsSyncEvent->queueWait(pmeQueue);
}
if (hasOffsets) {
// The Ewald self energy was computed in the kernel.
energy = 0.0;
if (pmeGrid1.isInitialized() || cosSinSums.isInitialized()) {
// Invoke a kernel to compute the correction for the neutralizing plasma.
Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c);
double volume = a[0]*b[1]*c[2];
if (cc.getUseDoublePrecision())
computePlasmaCorrectionKernel->setArg(3, volume);
else
computePlasmaCorrectionKernel->setArg(3, (float) volume);
computePlasmaCorrectionKernel->execute(ComputeContext::ThreadBlockSize, ComputeContext::ThreadBlockSize);
}
}
recomputeParams = false;
}
// Do reciprocal space calculations.
if (cosSinSums.isInitialized() && includeReciprocal) {
Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c);
if (cc.getUseDoublePrecision()) {
ewaldSumsKernel->setArg(3, mm_double4(a[0], b[1], c[2], 0));
ewaldForcesKernel->setArg(3, mm_double4(a[0], b[1], c[2], 0));
}
else {
ewaldSumsKernel->setArg(3, mm_float4((float) a[0], (float) b[1], (float) c[2], 0));
ewaldForcesKernel->setArg(3, mm_float4((float) a[0], (float) b[1], (float) c[2], 0));
}
ewaldSumsKernel->execute(cosSinSums.getSize());
ewaldForcesKernel->execute(cc.getNumAtoms());
}
if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeQueue)
cc.setCurrentQueue(pmeQueue);
// Invert the periodic box vectors.
Vec3 boxVectors[3];
cc.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
double scale = 1.0/determinant;
mm_double4 recipBoxVectors[3];
recipBoxVectors[0] = mm_double4(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0, 0);
recipBoxVectors[1] = mm_double4(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0, 0);
recipBoxVectors[2] = mm_double4((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale, 0);
mm_float4 recipBoxVectorsFloat[3];
for (int i = 0; i < 3; i++)
recipBoxVectorsFloat[i] = mm_float4((float) recipBoxVectors[i].x, (float) recipBoxVectors[i].y, (float) recipBoxVectors[i].z, 0);
// Execute the reciprocal space kernels.
if (hasCoulomb) {
setPeriodicBoxArgs(cc, pmeGridIndexKernel, 2);
if (cc.getUseDoublePrecision()) {
pmeGridIndexKernel->setArg(7, recipBoxVectors[0]);
pmeGridIndexKernel->setArg(8, recipBoxVectors[1]);
pmeGridIndexKernel->setArg(9, recipBoxVectors[2]);
}
else {
pmeGridIndexKernel->setArg(7, recipBoxVectorsFloat[0]);
pmeGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeGridIndexKernel->execute(cc.getNumAtoms());
sort->sort(pmeAtomGridIndex);
setPeriodicBoxArgs(cc, pmeSpreadChargeKernel, 2);
if (cc.getUseDoublePrecision()) {
pmeSpreadChargeKernel->setArg(7, recipBoxVectors[0]);
pmeSpreadChargeKernel->setArg(8, recipBoxVectors[1]);
pmeSpreadChargeKernel->setArg(9, recipBoxVectors[2]);
}
else {
pmeSpreadChargeKernel->setArg(7, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeSpreadChargeKernel->execute(cc.getNumAtoms());
if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) {
pmeConvolutionKernel->setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel->setArg<mm_double4>(5, recipBoxVectors[1]);
pmeConvolutionKernel->setArg<mm_double4>(6, recipBoxVectors[2]);
pmeEvalEnergyKernel->setArg<mm_double4>(5, recipBoxVectors[0]);
pmeEvalEnergyKernel->setArg<mm_double4>(6, recipBoxVectors[1]);
pmeEvalEnergyKernel->setArg<mm_double4>(7, recipBoxVectors[2]);
}
else {
pmeConvolutionKernel->setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
pmeConvolutionKernel->setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
pmeConvolutionKernel->setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
pmeEvalEnergyKernel->setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
pmeEvalEnergyKernel->setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeEvalEnergyKernel->setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
pmeEvalEnergyKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
pmeConvolutionKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid1, false);
setPeriodicBoxArgs(cc, pmeInterpolateForceKernel, 3);
if (cc.getUseDoublePrecision()) {
pmeInterpolateForceKernel->setArg(8, recipBoxVectors[0]);
pmeInterpolateForceKernel->setArg(9, recipBoxVectors[1]);
pmeInterpolateForceKernel->setArg(10, recipBoxVectors[2]);
}
else {
pmeInterpolateForceKernel->setArg(8, recipBoxVectorsFloat[0]);
pmeInterpolateForceKernel->setArg(9, recipBoxVectorsFloat[1]);
pmeInterpolateForceKernel->setArg(10, recipBoxVectorsFloat[2]);
}
if (deviceIsCpu)
pmeInterpolateForceKernel->execute(cc.getNumThreadBlocks(), 1);
else
pmeInterpolateForceKernel->execute(cc.getNumAtoms());
}
if (doLJPME && hasLJ) {
setPeriodicBoxArgs(cc, pmeDispersionGridIndexKernel, 2);
if (cc.getUseDoublePrecision()) {
pmeDispersionGridIndexKernel->setArg(7, recipBoxVectors[0]);
pmeDispersionGridIndexKernel->setArg(8, recipBoxVectors[1]);
pmeDispersionGridIndexKernel->setArg(9, recipBoxVectors[2]);
}
else {
pmeDispersionGridIndexKernel->setArg(7, recipBoxVectorsFloat[0]);
pmeDispersionGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeDispersionGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeDispersionGridIndexKernel->execute(cc.getNumAtoms());
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex);
if (useFixedPointChargeSpreading)
cc.clearBuffer(pmeGrid2);
else
cc.clearBuffer(pmeGrid1);
setPeriodicBoxArgs(cc, pmeDispersionSpreadChargeKernel, 2);
if (cc.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel->setArg(7, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel->setArg(8, recipBoxVectors[1]);
pmeDispersionSpreadChargeKernel->setArg(9, recipBoxVectors[2]);
}
else {
pmeDispersionSpreadChargeKernel->setArg(7, recipBoxVectorsFloat[0]);
pmeDispersionSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms());
if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel->setArg(4, recipBoxVectors[0]);
pmeDispersionConvolutionKernel->setArg(5, recipBoxVectors[1]);
pmeDispersionConvolutionKernel->setArg(6, recipBoxVectors[2]);
pmeDispersionEvalEnergyKernel->setArg(5, recipBoxVectors[0]);
pmeDispersionEvalEnergyKernel->setArg(6, recipBoxVectors[1]);
pmeDispersionEvalEnergyKernel->setArg(7, recipBoxVectors[2]);
}
else {
pmeDispersionConvolutionKernel->setArg(4, recipBoxVectorsFloat[0]);
pmeDispersionConvolutionKernel->setArg(5, recipBoxVectorsFloat[1]);
pmeDispersionConvolutionKernel->setArg(6, recipBoxVectorsFloat[2]);
pmeDispersionEvalEnergyKernel->setArg(5, recipBoxVectorsFloat[0]);
pmeDispersionEvalEnergyKernel->setArg(6, recipBoxVectorsFloat[1]);
pmeDispersionEvalEnergyKernel->setArg(7, recipBoxVectorsFloat[2]);
}
if (!hasCoulomb)
cc.clearBuffer(pmeEnergyBuffer);
if (includeEnergy)
pmeDispersionEvalEnergyKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
pmeDispersionConvolutionKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
setPeriodicBoxArgs(cc, pmeDispersionInterpolateForceKernel, 3);
if (cc.getUseDoublePrecision()) {
pmeDispersionInterpolateForceKernel->setArg(8, recipBoxVectors[0]);
pmeDispersionInterpolateForceKernel->setArg(9, recipBoxVectors[1]);
pmeDispersionInterpolateForceKernel->setArg(10, recipBoxVectors[2]);
}
else {
pmeDispersionInterpolateForceKernel->setArg(8, recipBoxVectorsFloat[0]);
pmeDispersionInterpolateForceKernel->setArg(9, recipBoxVectorsFloat[1]);
pmeDispersionInterpolateForceKernel->setArg(10, recipBoxVectorsFloat[2]);
}
if (deviceIsCpu)
pmeDispersionInterpolateForceKernel->execute(cc.getNumThreadBlocks(), 1);
else
pmeDispersionInterpolateForceKernel->execute(cc.getNumAtoms());
}
if (usePmeQueue) {
pmeSyncEvent->enqueue();
cc.restoreDefaultQueue();
}
}
if (dispersionCoefficient != 0.0 && includeDirect) {
Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c);
energy += dispersionCoefficient/(a[0]*b[1]*c[2]);
}
return energy;
}
void CommonCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException) {
// Make sure the new parameters are acceptable.
ContextSelector selector(cc);
if (force.getNumParticles() != cc.getNumAtoms())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
if (!hasCoulomb || !hasLJ) {
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
if (!hasCoulomb && charge != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
if (!hasLJ && epsilon != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
}
}
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
if (exceptionIndex.find(i) == exceptionIndex.end()) {
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end())
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
}
else
exceptions.push_back(i);
}
int numContexts = cc.getNumContexts();
int startIndex = cc.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cc.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions != exceptionAtoms.size())
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
// Record the per-particle parameters.
if (firstParticle <= lastParticle) {
vector<mm_float4> baseParticleParamVec(cc.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = mm_float4(charge, sigma, epsilon, 0);
}
baseParticleParams.uploadSubArray(&baseParticleParamVec[firstParticle], firstParticle, lastParticle-firstParticle+1);
// Compute the self energy.
ewaldSelfEnergy = 0.0;
totalCharge = 0.0;
if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
if (cc.getContextIndex() == 0) {
for (int i = 0; i < force.getNumParticles(); i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
totalCharge += baseParticleParamVec[i].x;
if (doLJPME)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
}
}
}
// Record the exceptions.
if (firstException <= lastException) {
vector<mm_float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], particle1, particle2, chargeProd, sigma, epsilon);
if (make_pair(particle1, particle2) != exceptionAtoms[i])
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
baseExceptionParamsVec[i] = mm_float4(chargeProd, sigma, epsilon, 0);
}
baseExceptionParams.upload(baseExceptionParamsVec);
}
// Record parameter offsets.
vector<vector<mm_float4> > particleOffsetVec(force.getNumParticles());
vector<vector<mm_float4> > exceptionOffsetVec(numExceptions);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
if (paramPos == paramNames.end())
throw OpenMMException("updateParametersInContext: The parameter of a particle parameter offset has changed");
int paramIndex = paramPos-paramNames.begin();
particleOffsetVec[particle].push_back(mm_float4(charge, sigma, epsilon, paramIndex));
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
int index = exceptionIndex[exception];
if (index < startIndex || index >= endIndex)
continue;
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
if (paramPos == paramNames.end())
throw OpenMMException("updateParametersInContext: The parameter of an exception parameter offset has changed");
int paramIndex = paramPos-paramNames.begin();
exceptionOffsetVec[index-startIndex].push_back(mm_float4(charge, sigma, epsilon, paramIndex));
}
if (max(force.getNumParticleParameterOffsets(), 1) != particleParamOffsets.getSize())
throw OpenMMException("updateParametersInContext: The number of particle parameter offsets has changed");
vector<mm_float4> p, e;
for (int i = 0; i < particleOffsetVec.size(); i++)
for (int j = 0; j < particleOffsetVec[i].size(); j++)
p.push_back(particleOffsetVec[i][j]);
for (int i = 0; i < exceptionOffsetVec.size(); i++)
for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
e.push_back(exceptionOffsetVec[i][j]);
if (force.getNumParticleParameterOffsets() > 0)
particleParamOffsets.upload(p);
if (max((int) e.size(), 1) != exceptionParamOffsets.getSize())
throw OpenMMException("updateParametersInContext: The number of exception parameter offsets has changed");
if (e.size() > 0)
exceptionParamOffsets.upload(e);
// Compute other values.
if (force.getUseDispersionCorrection() && cc.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
cc.invalidateMolecules(info, firstParticle <= lastParticle || force.getNumParticleParameterOffsets() > 0,
firstException <= lastException || force.getNumExceptionParameterOffsets() > 0);
recomputeParams = true;
}
void CommonCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != PME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (useCpuPme)
cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
else {
alpha = this->alpha;
nx = gridSizeX;
ny = gridSizeY;
nz = gridSizeZ;
}
}
void CommonCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != LJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (useCpuPme)
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
}
......@@ -42,7 +42,6 @@
#include "CudaArray.h"
#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaFFT3D.h"
#include "CudaIntegrationUtilities.h"
#include "CudaNonbondedUtilities.h"
#include "CudaPlatform.h"
......@@ -154,6 +153,12 @@ public:
* one ComputeContext is created for each device.
*/
std::vector<ComputeContext*> getAllContexts();
/**
* Get the ContextImpl is ComputeContext is associated with.
*/
ContextImpl& getContextImpl() {
return *platformData.context;
}
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
......@@ -176,6 +181,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/**
* Compile source code to create a ComputeProgram.
*
......@@ -515,7 +533,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
CudaFFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/**
* This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized.
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -48,6 +48,11 @@ public:
* Block until all operations started before the call to enqueue() have completed.
*/
void wait();
/**
* Enqueue a barrier that causes a specified ComputeQueue to block until all
* operations started before the call to enqueue() have completed.
*/
void queueWait(ComputeQueue queue);
private:
CudaContext& context;
CUevent event;
......
......@@ -50,7 +50,7 @@ class CudaContext;
* multiply every value of the original data set by the total number of data points.
*/
class OPENMM_EXPORT_COMMON CudaFFT3D : public FFT3D {
class OPENMM_EXPORT_COMMON CudaFFT3D : public FFT3DImpl {
public:
/**
* Create a CudaFFT3D object for performing transforms of a particular size.
......
......@@ -30,11 +30,12 @@
#include "CudaPlatform.h"
#include "CudaArray.h"
#include "CudaContext.h"
#include "CudaFFT3D.h"
#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
#include "openmm/common/CommonKernels.h"
#include "openmm/common/CommonCalcNonbondedForce.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/FFT3D.h"
namespace OpenMM {
......@@ -85,12 +86,11 @@ private:
/**
* This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
*/
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
class CudaCalcNonbondedForceKernel : public CommonCalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) {
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
CommonCalcNonbondedForceKernel(name, platform, cu, system), cu(cu) {
}
~CudaCalcNonbondedForceKernel();
/**
* Initialize the kernel.
*
......@@ -98,123 +98,8 @@ public:
* @param force the NonbondedForce this kernel will be used for
*/
void initialize(const System& system, const NonbondedForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @param includeDirect true if direct space interactions should be included
* @param includeReciprocal true if reciprocal space interactions should be included
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the NonbondedForce to copy the parameters from
* @param firstParticle the index of the first particle whose parameters might have changed
* @param lastParticle the index of the last particle whose parameters might have changed
* @param firstException the index of the first exception whose parameters might have changed
* @param lastException the index of the last exception whose parameters might have changed
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException);
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 8;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "int2";}
const char* getKeyType() const {return "int";}
const char* getMinKey() const {return "(-2147483647-1)";}
const char* getMaxKey() const {return "2147483647";}
const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
const char* getSortKey() const {return "value.y";}
};
class ForceInfo;
class PmeIO;
class PmePreComputation;
class PmePostComputation;
class SyncStreamPreComputation;
class SyncStreamPostComputation;
CudaContext& cu;
ForceInfo* info;
bool hasInitializedFFT;
CudaArray charges;
CudaArray sigmaEpsilon;
CudaArray exceptionParams;
CudaArray exclusionAtoms;
CudaArray exclusionParams;
CudaArray baseParticleParams;
CudaArray baseExceptionParams;
CudaArray particleParamOffsets;
CudaArray exceptionParamOffsets;
CudaArray particleOffsetIndices;
CudaArray exceptionOffsetIndices;
CudaArray globalParams;
CudaArray cosSinSums;
CudaArray pmeGrid1;
CudaArray pmeGrid2;
CudaArray pmeBsplineModuliX;
CudaArray pmeBsplineModuliY;
CudaArray pmeBsplineModuliZ;
CudaArray pmeDispersionBsplineModuliX;
CudaArray pmeDispersionBsplineModuliY;
CudaArray pmeDispersionBsplineModuliZ;
CudaArray pmeAtomGridIndex;
CudaArray pmeEnergyBuffer;
CudaArray chargeBuffer;
CudaSort* sort;
Kernel cpuPme;
PmeIO* pmeio;
ComputeQueue pmeQueue;
CUevent pmeSyncEvent, paramsSyncEvent;
CudaFFT3D* fft;
CudaFFT3D* dispersionFft;
CUfunction computeParamsKernel, computeExclusionParamsKernel, computePlasmaCorrectionKernel;
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeDispersionGridIndexKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeDispersionSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeDispersionFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel;
CUfunction pmeEvalDispersionEnergyKernel;
CUfunction pmeConvolutionKernel;
CUfunction pmeDispersionConvolutionKernel;
CUfunction pmeInterpolateForceKernel;
CUfunction pmeInterpolateDispersionForceKernel;
std::vector<std::pair<int, int> > exceptionAtoms;
std::vector<std::string> paramNames;
std::vector<double> paramValues;
std::map<int, int> exceptionIndex;
double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha, totalCharge;
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
/**
......
......@@ -30,6 +30,7 @@
#include "openmm/System.h"
#include "CudaArray.h"
#include "CudaExpressionUtilities.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/NonbondedUtilities.h"
#include <cuda.h>
#include <sstream>
......@@ -39,7 +40,6 @@
namespace OpenMM {
class CudaContext;
class CudaSort;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
......@@ -71,20 +71,6 @@ public:
class ParameterInfo;
CudaNonbondedUtilities(CudaContext& context);
~CudaNonbondedUtilities();
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
* @param usesCutoff specifies whether a cutoff should be applied to this interaction
* @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction
* @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction.
* @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param useNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList=true);
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
......@@ -99,7 +85,9 @@ public:
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList);
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const std::vector<std::vector<int> >& exclusionList, const std::string& kernel,
int forceGroup, bool useNeighborList=true, bool supportsPairList=false);
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
......@@ -343,7 +331,7 @@ private:
CudaArray largeBlockBoundingBox;
CudaArray oldPositions;
CudaArray rebuildNeighborList;
CudaSort* blockSorter;
ComputeSort blockSorter;
CUevent downloadCountEvent;
unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Portions copyright (c) 2010-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -28,6 +28,7 @@
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/common/ComputeSort.h"
#include "openmm/common/windowsExportCommon.h"
#include "CudaContext.h"
......@@ -41,7 +42,7 @@ namespace OpenMM {
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class FloatTrait : public CudaSort::SortTrait {
* class FloatTrait : public ComputeSortImpl::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
......@@ -66,9 +67,8 @@ namespace OpenMM {
* elements).
*/
class OPENMM_EXPORT_COMMON CudaSort {
class OPENMM_EXPORT_COMMON CudaSort : public ComputeSortImpl {
public:
class SortTrait;
/**
* Create a CudaSort object for sorting data of a particular type.
*
......@@ -82,15 +82,15 @@ public:
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
CudaSort(CudaContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
~CudaSort();
/**
* Sort an array.
*/
void sort(CudaArray& data);
void sort(ArrayInterface& data);
private:
CudaContext& context;
SortTrait* trait;
ComputeSortImpl::SortTrait* trait;
CudaArray dataRange;
CudaArray bucketOfElement;
CudaArray offsetInBucket;
......@@ -101,48 +101,6 @@ private:
bool isShortList, uniform;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class CudaSort::SortTrait {
public:
virtual ~SortTrait() {
}
/**
* Get the size of each data value in bytes.
*/
virtual int getDataSize() const = 0;
/**
* Get the size of each key value in bytes.
*/
virtual int getKeySize() const = 0;
/**
* Get the data type of the values to sort.
*/
virtual const char* getDataType() const = 0;
/**
* Get the data type of the sorting key.
*/
virtual const char* getKeyType() const = 0;
/**
* Get the minimum value a key can take.
*/
virtual const char* getMinKey() const = 0;
/**
* Get the maximum value a key can take.
*/
virtual const char* getMaxKey() const = 0;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual const char* getMaxValue() const = 0;
/**
* Get the CUDA code to select the key from the data value.
*/
virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
......@@ -32,11 +32,13 @@
#include "CudaArray.h"
#include "CudaBondedUtilities.h"
#include "CudaEvent.h"
#include "CudaFFT3D.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernels.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "CudaProgram.h"
#include "CudaSort.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/common/ContextSelector.h"
#include "SHA1.h"
......@@ -439,8 +441,8 @@ void CudaContext::initializeContexts() {
getPlatformData().initializeContexts(system);
}
CudaFFT3D* CudaContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex) {
return new CudaFFT3D(*this, xsize, ysize, zsize, realToComplex);
FFT3D CudaContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex) {
return FFT3D(new CudaFFT3D(*this, xsize, ysize, zsize, realToComplex));
}
void CudaContext::setAsCurrent() {
......@@ -667,6 +669,10 @@ ComputeEvent CudaContext::createEvent() {
return shared_ptr<ComputeEventImpl>(new CudaEvent(*this));
}
ComputeSort CudaContext::createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) {
return shared_ptr<ComputeSortImpl>(new CudaSort(*this, trait, length, uniform));
}
ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines);
return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module));
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2019-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -42,9 +42,13 @@ CudaEvent::~CudaEvent() {
}
void CudaEvent::enqueue() {
cuEventRecord(event, 0);
cuEventRecord(event, context.getCurrentStream());
}
void CudaEvent::wait() {
cuEventSynchronize(event);
}
void CudaEvent::queueWait(ComputeQueue queue) {
cuStreamWaitEvent(dynamic_cast<CudaQueue*>(queue.get())->getStream(), event, 0);
}
......@@ -86,1104 +86,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
return sum;
}
class CudaCalcNonbondedForceKernel::ForceInfo : public CudaForceInfo {
public:
ForceInfo(const NonbondedForce& force) : force(force) {
particleOffset.resize(force.getNumParticles(), -1);
exceptionOffset.resize(force.getNumExceptions(), -1);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string parameter;
int particleIndex;
double chargeScale, sigmaScale, epsilonScale;
force.getParticleParameterOffset(i, parameter, particleIndex, chargeScale, sigmaScale, epsilonScale);
particleOffset[particleIndex] = i;
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string parameter;
int exceptionIndex;
double chargeProdScale, sigmaScale, epsilonScale;
force.getExceptionParameterOffset(i, parameter, exceptionIndex, chargeProdScale, sigmaScale, epsilonScale);
exceptionOffset[exceptionIndex] = i;
}
}
bool areParticlesIdentical(int particle1, int particle2) {
if (particleOffset[particle1] != -1 || particleOffset[particle2] != -1) {
if (particleOffset[particle1] == -1 || particleOffset[particle2] == -1)
return false;
string parameter1, parameter2;
int particleIndex1, particleIndex2;
double chargeScale1, sigmaScale1, epsilonScale1, chargeScale2, sigmaScale2, epsilonScale2;
force.getParticleParameterOffset(particleOffset[particle1], parameter1, particleIndex1, chargeScale1, sigmaScale1, epsilonScale1);
force.getParticleParameterOffset(particleOffset[particle2], parameter2, particleIndex2, chargeScale2, sigmaScale2, epsilonScale2);
if (parameter1 != parameter2 || chargeScale1 != chargeScale2 || sigmaScale1 != sigmaScale2 || epsilonScale1 != epsilonScale2)
return false;
}
double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
int getNumParticleGroups() {
return force.getNumExceptions();
}
void getParticlesInGroup(int index, vector<int>& particles) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
particles.resize(2);
particles[0] = particle1;
particles[1] = particle2;
}
bool areGroupsIdentical(int group1, int group2) {
if (exceptionOffset[group1] != -1 || exceptionOffset[group2] != -1) {
if (exceptionOffset[group1] == -1 || exceptionOffset[group2] == -1)
return false;
string parameter1, parameter2;
int exceptionIndex1, exceptionIndex2;
double chargeProdScale1, sigmaScale1, epsilonScale1, chargeProdScale2, sigmaScale2, epsilonScale2;
force.getExceptionParameterOffset(exceptionOffset[group1], parameter1, exceptionIndex1, chargeProdScale1, sigmaScale1, epsilonScale1);
force.getExceptionParameterOffset(exceptionOffset[group2], parameter2, exceptionIndex2, chargeProdScale2, sigmaScale2, epsilonScale2);
if (parameter1 != parameter2 || chargeProdScale1 != chargeProdScale2 || sigmaScale1 != sigmaScale2 || epsilonScale1 != epsilonScale2)
return false;
}
int particle1, particle2;
double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
private:
const NonbondedForce& force;
vector<int> particleOffset, exceptionOffset;
};
class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel) {
forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
}
float* getPosq() {
ContextSelector selector(cu);
cu.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp.upload(force);
void* args[] = {&forceTemp.getDevicePointer(), &cu.getForce().getDevicePointer()};
cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
}
private:
CudaContext& cu;
vector<float4> posq;
CudaArray forceTemp;
CUfunction addForcesKernel;
};
class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
public:
PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxVectors[3] = {Vec3(cu.getPeriodicBoxSize().x, 0, 0), Vec3(0, cu.getPeriodicBoxSize().y, 0), Vec3(0, 0, cu.getPeriodicBoxSize().z)};
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
}
private:
CudaContext& cu;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
public:
SyncStreamPreComputation(CudaContext& cu, ComputeQueue queue, CUevent event, int forceGroup) : cu(cu), queue(queue), event(event), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
cuEventRecord(event, cu.getCurrentStream());
cuStreamWaitEvent(dynamic_cast<CudaQueue*>(queue.get())->getStream(), event, 0);
}
}
private:
CudaContext& cu;
ComputeQueue queue;
CUevent event;
int forceGroup;
};
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public:
SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (includeEnergy) {
int bufferSize = pmeEnergyBuffer.getSize();
void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
cu.executeKernel(addEnergyKernel, args, bufferSize);
}
}
return 0.0;
}
private:
CudaContext& cu;
CUevent event;
CUfunction addEnergyKernel;
CudaArray& pmeEnergyBuffer;
int forceGroup;
};
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
ContextSelector selector(cu);
if (sort != NULL)
delete sort;
if (fft != NULL)
delete fft;
if (dispersionFft != NULL)
delete dispersionFft;
if (pmeio != NULL)
delete pmeio;
}
void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
ContextSelector selector(cu);
int forceIndex;
for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
;
string prefix = "nonbonded"+cu.intToString(forceIndex)+"_";
// Identify which exceptions are 1-4 interactions.
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<pair<int, int> > exclusions;
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
exclusions.push_back(pair<int, int>(particle1, particle2));
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
exceptionIndex[i] = exceptions.size();
exceptions.push_back(i);
}
}
// Initialize nonbonded interactions.
int numParticles = force.getNumParticles();
vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
vector<vector<int> > exclusionList(numParticles);
hasCoulomb = false;
hasLJ = false;
for (int i = 0; i < numParticles; i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
exclusionList[i].push_back(i);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (auto exclusion : exclusions) {
exclusionList[exclusion.first].push_back(exclusion.second);
exclusionList[exclusion.second].push_back(exclusion.first);
}
nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
bool useCutoff = (nonbondedMethod != NoCutoff);
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cu.requestPosqCharges() : false;
map<string, string> defines;
defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
if (useCutoff) {
// Compute the reaction field constants.
double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
// Compute the switching coefficients.
if (force.getUseSwitchingFunction()) {
defines["LJ_SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
defines["LJ_SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
defines["LJ_SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
}
}
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
dispersionCoefficient = 0.0;
alpha = 0;
ewaldSelfEnergy = 0.0;
totalCharge = 0.0;
map<string, string> paramsDefines;
paramsDefines["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
paramsDefines["EPSILON0"] = cu.doubleToString(EPSILON0);
paramsDefines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize);
hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
if (hasOffsets)
paramsDefines["HAS_OFFSETS"] = "1";
if (force.getNumParticleParameterOffsets() > 0)
paramsDefines["HAS_PARTICLE_OFFSETS"] = "1";
if (force.getNumExceptionParameterOffsets() > 0)
paramsDefines["HAS_EXCEPTION_OFFSETS"] = "1";
if (usePosqCharges)
paramsDefines["USE_POSQ_CHARGES"] = "1";
if (doLJPME)
paramsDefines["INCLUDE_LJPME_EXCEPTIONS"] = "1";
if (nonbondedMethod == Ewald) {
// Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz;
NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
if (cu.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
totalCharge += baseParticleParamVec[i].x;
}
// Create the reciprocal space kernels.
map<string, string> replacements;
replacements["NUM_ATOMS"] = cu.intToString(numParticles);
replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
replacements["KMAX_X"] = cu.intToString(kmaxx);
replacements["KMAX_Y"] = cu.intToString(kmaxy);
replacements["KMAX_Z"] = cu.intToString(kmaxz);
replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
replacements["M_PI"] = cu.doubleToString(M_PI);
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CommonKernelSources::ewald, replacements);
ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
}
}
else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
// Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = cu.findLegalFFTDimension(gridSizeX);
gridSizeY = cu.findLegalFFTDimension(gridSizeY);
gridSizeZ = cu.findLegalFFTDimension(gridSizeZ);
if (doLJPME) {
NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = cu.findLegalFFTDimension(dispersionGridSizeX);
dispersionGridSizeY = cu.findLegalFFTDimension(dispersionGridSizeY);
dispersionGridSizeZ = cu.findLegalFFTDimension(dispersionGridSizeZ);
}
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
defines["DO_LJPME"] = doLJPME ? "1" : "0";
if (doLJPME) {
defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
double dar4 = dar2*dar2;
double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
defines["INVCUT6"] = cu.doubleToString(invRCut6);
defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
}
if (cu.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
totalCharge += baseParticleParamVec[i].x;
}
if (doLJPME) {
paramsDefines["INCLUDE_LJPME"] = "1";
paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cu.doubleToString(pow(dispersionAlpha, 6)/3.0);
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
char deviceName[100];
cuDeviceGetName(deviceName, 100, cu.getDevice());
usePmeStream = (!cu.getPlatformData().disablePmeStream && !cu.getPlatformData().useCpuPme && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
map<string, string> pmeDefines;
pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI);
useFixedPointChargeSpreading = cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces;
if (useFixedPointChargeSpreading)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1";
map<string, string> replacements;
replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CommonKernelSources::pme, replacements), pmeDefines);
if (cu.getPlatformData().useCpuPme && !doLJPME && usePosqCharges) {
// Create the CPU PME kernel.
try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
CUfunction addForcesKernel = cu.getKernel(module, "addForces");
pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
}
}
if (pmeio == NULL) {
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_SHARED);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
if (doLJPME) {
pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
module = cu.createModule(CudaKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
cuFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
}
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME) {
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
if (useFixedPointChargeSpreading)
cu.addAutoclearBuffer(pmeGrid2);
else
cu.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
if (doLJPME) {
pmeDispersionBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
}
pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(pmeEnergyBuffer);
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
fft = cu.createFFT(gridSizeX, gridSizeY, gridSizeZ, true);
if (doLJPME)
dispersionFft = cu.createFFT(dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
// Prepare for doing PME on its own stream.
if (usePmeStream) {
pmeQueue = cu.createQueue();
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
CHECK_RESULT(cuEventCreate(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeQueue, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
}
hasInitializedFFT = true;
// Initialize the b-spline moduli.
for (int grid = 0; grid < 2; grid++) {
int xsize, ysize, zsize;
CudaArray *xmoduli, *ymoduli, *zmoduli;
if (grid == 0) {
xsize = gridSizeX;
ysize = gridSizeY;
zsize = gridSizeZ;
xmoduli = &pmeBsplineModuliX;
ymoduli = &pmeBsplineModuliY;
zmoduli = &pmeBsplineModuliZ;
}
else {
if (!doLJPME)
continue;
xsize = dispersionGridSizeX;
ysize = dispersionGridSizeY;
zsize = dispersionGridSizeZ;
xmoduli = &pmeDispersionBsplineModuliX;
ymoduli = &pmeDispersionBsplineModuliY;
zmoduli = &pmeDispersionBsplineModuliZ;
}
int maxSize = max(max(xsize, ysize), zsize);
vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder);
vector<double> bsplines_data(maxSize);
data[PmeOrder-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
// Differentiate.
ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++)
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PmeOrder-1);
data[PmeOrder-1] = 0.0;
for (int i = 1; i < (PmeOrder-1); i++)
data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
data[0] = div*data[0];
for (int i = 0; i < maxSize; i++)
bsplines_data[i] = 0.0;
for (int i = 1; i <= PmeOrder; i++)
bsplines_data[i] = data[i-1];
// Evaluate the actual bspline moduli for X/Y/Z.
for (int dim = 0; dim < 3; dim++) {
int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
vector<double> moduli(ndata);
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
}
moduli[i] = sc*sc+ss*ss;
}
for (int i = 0; i < ndata; i++)
if (moduli[i] < 1.0e-7)
moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5;
if (dim == 0)
xmoduli->upload(moduli, true);
else if (dim == 1)
ymoduli->upload(moduli, true);
else
zmoduli->upload(moduli, true);
}
}
}
}
}
// Add code to subtract off the reciprocal part of excluded interactions.
if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
int numExclusions = endIndex-startIndex;
if (numExclusions > 0) {
paramsDefines["HAS_EXCLUSIONS"] = "1";
vector<vector<int> > atoms(numExclusions, vector<int>(2));
exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
vector<int2> exclusionAtomsVec(numExclusions);
for (int i = 0; i < numExclusions; i++) {
int j = i+startIndex;
exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
atoms[i][0] = exclusions[j].first;
atoms[i][1] = exclusions[j].second;
}
exclusionAtoms.upload(exclusionAtomsVec);
map<string, string> replacements;
replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams, "float4");
replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
replacements["DO_LJPME"] = doLJPME ? "1" : "0";
replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
if (doLJPME)
replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
if (force.getIncludeDirectSpace())
cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
}
}
// Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
baseParticleParams.upload(baseParticleParamVec);
map<string, string> replacements;
replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
if (usePosqCharges) {
replacements["CHARGE1"] = "posq1.w";
replacements["CHARGE2"] = "posq2.w";
}
else {
replacements["CHARGE1"] = prefix+"charge1";
replacements["CHARGE2"] = prefix+"charge2";
}
if (hasCoulomb && !usePosqCharges)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
if (hasLJ) {
replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
}
source = cu.replaceStrings(source, replacements);
if (force.getIncludeDirectSpace())
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);
// Initialize the exceptions.
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions > 0) {
paramsDefines["HAS_EXCEPTIONS"] = "1";
exceptionAtoms.resize(numExceptions);
vector<vector<int> > atoms(numExceptions, vector<int>(2));
exceptionParams.initialize<float4>(cu, numExceptions, "exceptionParams");
baseExceptionParams.initialize<float4>(cu, numExceptions, "baseExceptionParams");
vector<float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
}
baseExceptionParams.upload(baseExceptionParamsVec);
map<string, string> replacements;
replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams, "float4");
if (force.getIncludeDirectSpace())
cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
}
// Initialize parameter offsets.
vector<vector<float4> > particleOffsetVec(force.getNumParticles());
vector<vector<float4> > exceptionOffsetVec(numExceptions);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
particleOffsetVec[particle].push_back(make_float4(charge, sigma, epsilon, paramIndex));
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
int index = exceptionIndex[exception];
if (index < startIndex || index >= endIndex)
continue;
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
exceptionOffsetVec[index-startIndex].push_back(make_float4(charge, sigma, epsilon, paramIndex));
}
paramValues.resize(paramNames.size(), 0.0);
particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
vector<float4> p, e;
for (int i = 0; i < particleOffsetVec.size(); i++) {
particleOffsetIndicesVec.push_back(p.size());
for (int j = 0; j < particleOffsetVec[i].size(); j++)
p.push_back(particleOffsetVec[i][j]);
}
while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
particleOffsetIndicesVec.push_back(p.size());
for (int i = 0; i < exceptionOffsetVec.size(); i++) {
exceptionOffsetIndicesVec.push_back(e.size());
for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
e.push_back(exceptionOffsetVec[i][j]);
}
exceptionOffsetIndicesVec.push_back(e.size());
if (force.getNumParticleParameterOffsets() > 0) {
particleParamOffsets.upload(p);
particleOffsetIndices.upload(particleOffsetIndicesVec);
}
exceptionParamOffsets.initialize<float4>(cu, max((int) e.size(), 1), "exceptionParamOffsets");
exceptionOffsetIndices.initialize<int>(cu, exceptionOffsetIndicesVec.size(), "exceptionOffsetIndices");
if (e.size() > 0) {
exceptionParamOffsets.upload(e);
exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
}
globalParams.initialize(cu, max((int) paramValues.size(), 1), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
if (paramValues.size() > 0)
globalParams.upload(paramValues, true);
chargeBuffer.initialize(cu, cu.getNumThreadBlocks(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "chargeBuffer");
cu.clearBuffer(chargeBuffer);
recomputeParams = true;
// Initialize the kernel for updating parameters.
CUmodule module = cu.createModule(CommonKernelSources::nonbondedParameters, paramsDefines);
computeParamsKernel = cu.getKernel(module, "computeParameters");
computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
computePlasmaCorrectionKernel = cu.getKernel(module, "computePlasmaCorrection");
info = new ForceInfo(force);
cu.addForce(info);
}
double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
// Update particle and exception parameters.
ContextSelector selector(cu);
bool paramChanged = false;
for (int i = 0; i < paramNames.size(); i++) {
double value = context.getParameter(paramNames[i]);
if (value != paramValues[i]) {
paramValues[i] = value;;
paramChanged = true;
}
}
if (paramChanged) {
recomputeParams = true;
globalParams.upload(paramValues, true);
}
double energy = 0.0;
if (includeReciprocal && (pmeGrid1.isInitialized() || cosSinSums.isInitialized())) {
double4 boxSize = cu.getPeriodicBoxSize();
double volume = boxSize.x*boxSize.y*boxSize.z;
energy = ewaldSelfEnergy - totalCharge*totalCharge/(8*EPSILON0*volume*alpha*alpha);
}
if (recomputeParams || hasOffsets) {
int computeSelfEnergy = (includeEnergy && includeReciprocal);
int numAtoms = cu.getNumAtoms();
vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
&baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
&particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer(), &chargeBuffer.getDevicePointer()};
int numExceptions;
if (exceptionParams.isInitialized()) {
numExceptions = exceptionParams.getSize();
paramsArgs.push_back(&numExceptions);
paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
paramsArgs.push_back(&exceptionParams.getDevicePointer());
paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
}
cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
if (exclusionParams.isInitialized()) {
int numExclusions = exclusionParams.getSize();
vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
&numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
}
if (usePmeStream) {
cuEventRecord(paramsSyncEvent, cu.getCurrentStream());
cuStreamWaitEvent(dynamic_cast<CudaQueue*>(pmeQueue.get())->getStream(), paramsSyncEvent, 0);
}
if (hasOffsets) {
// The Ewald self energy was computed in the kernel.
energy = 0.0;
if (pmeGrid1.isInitialized() || cosSinSums.isInitialized()) {
// Invoke a kernel to compute the correction for the neutralizing plasma.
double4 boxSize = cu.getPeriodicBoxSize();
if (cu.getUseDoublePrecision()) {
double volume = boxSize.x*boxSize.y*boxSize.z;
vector<void*> correctionArgs = {&chargeBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &alpha, &volume};
cu.executeKernel(computePlasmaCorrectionKernel, &correctionArgs[0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
}
else {
float alphaFloat = (float) alpha;
float volume = boxSize.x*boxSize.y*boxSize.z;
vector<void*> correctionArgs = {&chargeBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &alphaFloat, &volume};
cu.executeKernel(computePlasmaCorrectionKernel, &correctionArgs[0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
}
}
}
recomputeParams = false;
}
// Do reciprocal space calculations.
if (cosSinSums.isInitialized() && includeReciprocal) {
void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums.getSize());
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeStream)
cu.setCurrentQueue(pmeQueue);
// Invert the periodic box vectors.
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
double scale = 1.0/determinant;
double4 recipBoxVectors[3];
recipBoxVectors[0] = make_double4(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0, 0);
recipBoxVectors[1] = make_double4(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0, 0);
recipBoxVectors[2] = make_double4((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale, 0);
float4 recipBoxVectorsFloat[3];
void* recipBoxVectorPointer[3];
if (cu.getUseDoublePrecision()) {
recipBoxVectorPointer[0] = &recipBoxVectors[0];
recipBoxVectorPointer[1] = &recipBoxVectors[1];
recipBoxVectorPointer[2] = &recipBoxVectors[2];
}
else {
recipBoxVectorsFloat[0] = make_float4((float) recipBoxVectors[0].x, 0, 0, 0);
recipBoxVectorsFloat[1] = make_float4((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0, 0);
recipBoxVectorsFloat[2] = make_float4((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z, 0);
recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
}
// Execute the reciprocal space kernels.
// With fixed point charge spreading, finishSpreadCharge kernel is not needed as
// gridSpreadCharge can write directly to pmeGrid1.
CudaArray& pmeSpreadDstGrid = useFixedPointChargeSpreading ? pmeGrid2 : pmeGrid1;
if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
if (useFixedPointChargeSpreading) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
fft->execFFT(pmeGrid1, pmeGrid2, true);
if (includeEnergy) {
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
&pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
fft->execFFT(pmeGrid2, pmeGrid1, false);
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (doLJPME && hasLJ) {
if (!hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
cu.clearBuffer(pmeEnergyBuffer);
}
cu.clearBuffer(pmeSpreadDstGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
if (useFixedPointChargeSpreading) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
if (includeEnergy) {
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
&pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (usePmeStream) {
cuEventRecord(pmeSyncEvent, dynamic_cast<CudaQueue*>(pmeQueue.get())->getStream());
cu.restoreDefaultQueue();
}
}
if (dispersionCoefficient != 0.0 && includeDirect) {
double4 boxSize = cu.getPeriodicBoxSize();
energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
}
return energy;
}
void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException) {
// Make sure the new parameters are acceptable.
ContextSelector selector(cu);
if (force.getNumParticles() != cu.getNumAtoms())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
if (!hasCoulomb || !hasLJ) {
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
if (!hasCoulomb && charge != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
if (!hasLJ && epsilon != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
}
}
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
if (exceptionIndex.find(i) == exceptionIndex.end()) {
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end())
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
}
else
exceptions.push_back(i);
}
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions != exceptionAtoms.size())
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
// Record the per-particle parameters.
if (firstParticle <= lastParticle) {
vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
}
baseParticleParams.uploadSubArray(&baseParticleParamVec[firstParticle], firstParticle, lastParticle-firstParticle+1);
// Compute the self energy.
ewaldSelfEnergy = 0.0;
totalCharge = 0.0;
if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
if (cu.getContextIndex() == 0) {
for (int i = 0; i < force.getNumParticles(); i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
totalCharge += baseParticleParamVec[i].x;
if (doLJPME)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
}
}
}
// Record the exceptions.
if (firstException <= lastException) {
vector<float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], particle1, particle2, chargeProd, sigma, epsilon);
if (make_pair(particle1, particle2) != exceptionAtoms[i])
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
}
baseExceptionParams.upload(baseExceptionParamsVec);
}
// Record parameter offsets.
vector<vector<mm_float4> > particleOffsetVec(force.getNumParticles());
vector<vector<mm_float4> > exceptionOffsetVec(numExceptions);
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
if (paramPos == paramNames.end())
throw OpenMMException("updateParametersInContext: The parameter of a particle parameter offset has changed");
int paramIndex = paramPos-paramNames.begin();
particleOffsetVec[particle].push_back(mm_float4(charge, sigma, epsilon, paramIndex));
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
int index = exceptionIndex[exception];
if (index < startIndex || index >= endIndex)
continue;
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
if (paramPos == paramNames.end())
throw OpenMMException("updateParametersInContext: The parameter of an exception parameter offset has changed");
int paramIndex = paramPos-paramNames.begin();
exceptionOffsetVec[index-startIndex].push_back(mm_float4(charge, sigma, epsilon, paramIndex));
}
if (max(force.getNumParticleParameterOffsets(), 1) != particleParamOffsets.getSize())
throw OpenMMException("updateParametersInContext: The number of particle parameter offsets has changed");
vector<mm_float4> p, e;
for (int i = 0; i < particleOffsetVec.size(); i++)
for (int j = 0; j < particleOffsetVec[i].size(); j++)
p.push_back(particleOffsetVec[i][j]);
for (int i = 0; i < exceptionOffsetVec.size(); i++)
for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
e.push_back(exceptionOffsetVec[i][j]);
if (force.getNumParticleParameterOffsets() > 0)
particleParamOffsets.upload(p);
if (max((int) e.size(), 1) != exceptionParamOffsets.getSize())
throw OpenMMException("updateParametersInContext: The number of exception parameter offsets has changed");
if (e.size() > 0)
exceptionParamOffsets.upload(e);
// Compute other values.
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
cu.invalidateMolecules(info, firstParticle <= lastParticle || force.getNumParticleParameterOffsets() > 0,
firstException <= lastException || force.getNumExceptionParameterOffsets() > 0);
recomputeParams = true;
}
void CudaCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != PME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
else {
alpha = this->alpha;
nx = gridSizeX;
ny = gridSizeY;
nz = gridSizeZ;
}
}
void CudaCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (!doLJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
bool usePmeQueue = (!cu.getPlatformData().disablePmeStream && !cu.getPlatformData().useCpuPme);
bool useFixedPointChargeSpreading = cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces;
commonInitialize(system, force, usePmeQueue, false, useFixedPointChargeSpreading, cu.getPlatformData().useCpuPme);
}
......@@ -30,7 +30,6 @@
#include "CudaContext.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm>
#include <map>
#include <set>
......@@ -47,7 +46,7 @@ using namespace std;
}
class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait {
class CudaNonbondedUtilities::BlockSortTrait : public ComputeSortImpl::SortTrait {
public:
BlockSortTrait() {}
int getDataSize() const {return sizeof(int);}
......@@ -61,7 +60,7 @@ public:
};
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
pinnedCountBuffer(NULL), forceRebuildNeighborList(true), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
......@@ -82,18 +81,13 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
}
CudaNonbondedUtilities::~CudaNonbondedUtilities() {
if (blockSorter != NULL)
delete blockSorter;
if (pinnedCountBuffer != NULL)
cuMemFreeHost(pinnedCountBuffer);
cuEventDestroy(downloadCountEvent);
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList) {
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, useNeighborList, false);
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList) {
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance,
const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff");
......@@ -289,7 +283,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(), numAtomBlocks, false);
blockSorter = context.createSort(new BlockSortTrait(), numAtomBlocks, false);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Portions copyright (c) 2010-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,7 +32,7 @@
using namespace OpenMM;
using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform) :
CudaSort::CudaSort(CudaContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels.
......@@ -92,21 +92,22 @@ CudaSort::~CudaSort() {
delete trait;
}
void CudaSort::sort(CudaArray& data) {
void CudaSort::sort(ArrayInterface& data) {
if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0)
return;
CudaArray& cudata = context.unwrap(data);
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel.
if (dataLength <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength);
buckets.copyTo(data);
buckets.copyTo(cudata);
}
else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
void* sortArgs[] = {&cudata.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
}
......@@ -114,14 +115,14 @@ void CudaSort::sort(CudaArray& data) {
// Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
void* rangeArgs[] = {&cudata.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
void* elementsArgs[] = {&cudata.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
&bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
context.executeKernel(assignElementsKernel, elementsArgs, cudata.getSize(), 128);
// Compute the position of each bucket.
......@@ -130,13 +131,13 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
void* copyArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
&bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
context.executeKernel(copyToBucketsKernel, copyArgs, cudata.getSize());
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((cudata.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
}
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2021 Stanford University and the Authors. *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -48,7 +48,7 @@ using namespace std;
CudaPlatform platform;
class SortTrait : public CudaSort::SortTrait {
class SortTrait : public ComputeSortImpl::SortTrait {
int getDataSize() const {return 4;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "float";}
......
......@@ -51,7 +51,6 @@
#include "HipArray.h"
#include "HipBondedUtilities.h"
#include "HipExpressionUtilities.h"
#include "HipFFT3D.h"
#include "HipIntegrationUtilities.h"
#include "HipNonbondedUtilities.h"
#include "HipPlatform.h"
......@@ -156,6 +155,12 @@ public:
* one ComputeContext is created for each device.
*/
std::vector<ComputeContext*> getAllContexts();
/**
* Get the ContextImpl is ComputeContext is associated with.
*/
ContextImpl& getContextImpl() {
return *platformData.context;
}
/**
* Get a workspace used for accumulating energy when a simulation is parallelized across
* multiple devices.
......@@ -178,6 +183,19 @@ public:
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
ComputeEvent createEvent();
/**
* Construct a ComputeSort object of the appropriate class for this platform.
*
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the ComputeSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
ComputeSort createSort(ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform=true);
/**
* Create an object for performing 3D FFTs. The caller is responsible for deleting
* the object when it is no longer needed.
......@@ -187,7 +205,7 @@ public:
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
HipFFT3D* createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
FFT3D createFFT(int xsize, int ysize, int zsize, bool realToComplex=false);
/**
* Get the smallest legal size for a dimension of the grid.
*/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment