Commit 4334eca9 authored by Lee-Ping's avatar Lee-Ping
Browse files

Merge branch 'master' of https://github.com/SimTk/openmm

parents f91a528c 7844aa09
...@@ -396,6 +396,20 @@ IF(OPENMM_BUILD_DRUDE_PLUGIN) ...@@ -396,6 +396,20 @@ IF(OPENMM_BUILD_DRUDE_PLUGIN)
ADD_SUBDIRECTORY(plugins/drude) ADD_SUBDIRECTORY(plugins/drude)
ENDIF(OPENMM_BUILD_DRUDE_PLUGIN) ENDIF(OPENMM_BUILD_DRUDE_PLUGIN)
# CPU PME plugin
FIND_PACKAGE(FFTW QUIET)
IF(FFTW_FOUND)
SET(OPENMM_BUILD_PME_PLUGIN ON CACHE BOOL "Build CPU PME plugin")
ELSE(FFTW_FOUND)
SET(OPENMM_BUILD_PME_PLUGIN OFF CACHE BOOL "Build CPU PME plugin")
ENDIF(FFTW_FOUND)
SET(OPENMM_BUILD_PME_PATH)
IF(OPENMM_BUILD_PME_PLUGIN)
SET(OPENMM_BUILD_PME_PATH ${CMAKE_CURRENT_SOURCE_DIR}/plugins/cpupme)
ADD_SUBDIRECTORY(plugins/cpupme)
ENDIF(OPENMM_BUILD_PME_PLUGIN)
INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_TARGET}) INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_TARGET})
IF(OPENMM_BUILD_STATIC_LIB) IF(OPENMM_BUILD_STATIC_LIB)
INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${STATIC_TARGET}) INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${STATIC_TARGET})
......
# - Find FFTW
# Find the native FFTW includes and library
#
# FFTW_INCLUDES - where to find fftw3.h
# FFTW_LIBRARY - the main FFTW library.
# FFTW_THREADS_LIBRARY - the FFTW multithreading support library.
# FFTW_FOUND - True if FFTW found.
if (FFTW_INCLUDES)
# Already in cache, be silent
set (FFTW_FIND_QUIETLY TRUE)
endif (FFTW_INCLUDES)
find_path (FFTW_INCLUDES fftw3.h)
find_library (FFTW_LIBRARY NAMES fftw3f)
find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads)
# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
# all listed variables are TRUE
include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES)
find_package_handle_standard_args (FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES)
mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES)
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2011 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -112,7 +112,7 @@ ParseToken Parser::getNextToken(const string& expression, int start) { ...@@ -112,7 +112,7 @@ ParseToken Parser::getNextToken(const string& expression, int start) {
} }
if ((c == 'e' || c == 'E') && !foundExp) { if ((c == 'e' || c == 'E') && !foundExp) {
foundExp = true; foundExp = true;
if (pos < (int) expression.size()-1 && expression[pos+1] == '-') if (pos < (int) expression.size()-1 && (expression[pos+1] == '-' || expression[pos+1] == '+'))
pos++; pos++;
continue; continue;
} }
......
...@@ -1155,6 +1155,70 @@ public: ...@@ -1155,6 +1155,70 @@ public:
virtual void execute(ContextImpl& context) = 0; virtual void execute(ContextImpl& context) = 0;
}; };
/**
* This kernel performs the reciprocal space calculation for PME. In most cases, this
* calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
* In some cases it may want to outsource the work to a different kernel. In particular,
* GPU based platforms sometimes use a CPU based implementation provided by a separate
* plugin.
*/
class CalcPmeReciprocalForceKernel : public KernelImpl {
public:
class IO;
static std::string Name() {
return "CalcPmeReciprocalForce";
}
CalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
}
/**
* Initialize the kernel.
*
* @param gridx the x size of the PME grid
* @param gridy the y size of the PME grid
* @param gridz the z size of the PME grid
* @param numParticles the number of particles in the system
* @param alpha the Ewald blending parameter
*/
virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
/**
* Begin computing the force and energy.
*
* @param io an object that coordinates data transfer
* @param periodicBoxSize the size of the periodic box (measured in nm)
* @param includeEnergy true if potential energy should be computed
*/
virtual void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) = 0;
/**
* Finish computing the force and energy.
*
* @param io an object that coordinates data transfer
* @return the potential energy due to the PME reciprocal space interactions
*/
virtual double finishComputation(IO& io) = 0;
};
/**
* Any class that uses CalcPmeReciprocalForceKernel should create an implementation of this
* class, then pass it to the kernel to manage communication with it.
*/
class CalcPmeReciprocalForceKernel::IO {
public:
/**
* Get a pointer to the atom charges and positions. This array should contain four
* elements for each atom: x, y, z, and q in that order.
*/
virtual float* getPosq() = 0;
/**
* Record the forces calculated by the kernel.
*
* @param force an array containing four elements for each atom. The first three
* are the x, y, and z components of the force, while the fourth element
* should be ignored.
*/
virtual void setForce(float* force) = 0;
};
} // namespace OpenMM } // namespace OpenMM
#endif /*OPENMM_KERNELS_H_*/ #endif /*OPENMM_KERNELS_H_*/
...@@ -40,13 +40,19 @@ namespace OpenMM { ...@@ -40,13 +40,19 @@ namespace OpenMM {
/** /**
* This class implements an implicit solvation force using the GBSA-OBC model. * This class implements an implicit solvation force using the GBSA-OBC model.
* <p> *
* To use this class, create a GBSAOBCForce object, then call addParticle() once for each particle in the * To use this class, create a GBSAOBCForce object, then call addParticle() once for each particle in the
* System to define its parameters. The number of particles for which you define GBSA parameters must * System to define its parameters. The number of particles for which you define GBSA parameters must
* be exactly equal to the number of particles in the System, or else an exception will be thrown when you * be exactly equal to the number of particles in the System, or else an exception will be thrown when you
* try to create a Context. After a particle has been added, you can modify its force field parameters * try to create a Context. After a particle has been added, you can modify its force field parameters
* by calling setParticleParameters(). This will have no effect on Contexts that already exist unless you * by calling setParticleParameters(). This will have no effect on Contexts that already exist unless you
* call updateParametersInContext(). * call updateParametersInContext().
*
* When using this Force, the System should also include a NonbondedForce, and both objects must specify
* identical charges for all particles. Otherwise, the results will not be correct. Furthermore, if the
* nonbonded method is set to CutoffNonPeriodic or CutoffPeriodic, you should call setReactionFieldDielectric(1.0)
* on the NonbondedForce to turn off the reaction field approximation, which does not produce correct results
* when combined with GBSA.
*/ */
class OPENMM_EXPORT GBSAOBCForce : public Force { class OPENMM_EXPORT GBSAOBCForce : public Force {
...@@ -70,7 +76,7 @@ public: ...@@ -70,7 +76,7 @@ public:
*/ */
CutoffPeriodic = 2, CutoffPeriodic = 2,
}; };
/* /**
* Create a GBSAOBCForce. * Create a GBSAOBCForce.
*/ */
GBSAOBCForce(); GBSAOBCForce();
......
...@@ -81,6 +81,13 @@ public: ...@@ -81,6 +81,13 @@ public:
static const std::string key = "CudaPrecision"; static const std::string key = "CudaPrecision";
return key; return key;
} }
/**
* This is the name of the parameter for selecting whether to use the CPU based PME calculation.
*/
static const std::string& CudaUseCpuPme() {
static const std::string key = "CudaUseCpuPme";
return key;
}
/** /**
* This is the name of the parameter for specifying the path to the CUDA compiler. * This is the name of the parameter for specifying the path to the CUDA compiler.
*/ */
...@@ -99,14 +106,15 @@ public: ...@@ -99,14 +106,15 @@ public:
class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData { class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
public: public:
PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty, PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& compilerProperty, const std::string& tempProperty); const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty);
~PlatformData(); ~PlatformData();
void initializeContexts(const System& system); void initializeContexts(const System& system);
void syncContexts(); void syncContexts();
ContextImpl* context;
std::vector<CudaContext*> contexts; std::vector<CudaContext*> contexts;
std::vector<double> contextEnergy; std::vector<double> contextEnergy;
bool removeCM, peerAccessSupported; bool removeCM, peerAccessSupported, useCpuPme;
int cmMotionFrequency; int cmMotionFrequency;
int stepCount, computeForceCount; int stepCount, computeForceCount;
double time; double time;
......
...@@ -231,6 +231,10 @@ CudaContext::~CudaContext() { ...@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
delete forces[i]; delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i]; delete reorderListeners[i];
for (int i = 0; i < (int) preComputations.size(); i++)
delete preComputations[i];
for (int i = 0; i < (int) postComputations.size(); i++)
delete postComputations[i];
if (pinnedBuffer != NULL) if (pinnedBuffer != NULL)
cuMemFreeHost(pinnedBuffer); cuMemFreeHost(pinnedBuffer);
if (posq != NULL) if (posq != NULL)
...@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) { ...@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
reorderListeners.push_back(listener); reorderListeners.push_back(listener);
} }
void CudaContext::addPreComputation(ForcePreComputation* computation) {
preComputations.push_back(computation);
}
void CudaContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation);
}
struct CudaContext::WorkThread::ThreadData { struct CudaContext::WorkThread::ThreadData {
ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting, bool& finished, ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting, bool& finished,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) : pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
......
...@@ -70,6 +70,8 @@ public: ...@@ -70,6 +70,8 @@ public:
class WorkTask; class WorkTask;
class WorkThread; class WorkThread;
class ReorderListener; class ReorderListener;
class ForcePreComputation;
class ForcePostComputation;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision, CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
...@@ -454,6 +456,28 @@ public: ...@@ -454,6 +456,28 @@ public:
std::vector<ReorderListener*>& getReorderListeners() { std::vector<ReorderListener*>& getReorderListeners() {
return reorderListeners; return reorderListeners;
} }
/**
* Add a pre-computation that should be called at the very start of force and energy evaluations.
* The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void addPreComputation(ForcePreComputation* computation);
/**
* Get the list of ForcePreComputations.
*/
std::vector<ForcePreComputation*>& getPreComputations() {
return preComputations;
}
/**
* Add a post-computation that should be called at the very end of force and energy evaluations.
* The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void addPostComputation(ForcePostComputation* computation);
/**
* Get the list of ForcePostComputations.
*/
std::vector<ForcePostComputation*>& getPostComputations() {
return postComputations;
}
/** /**
* Mark that the current molecule definitions (and hence the atom order) may be invalid. * Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions * This should be called whenever force field parameters change. It will cause the definitions
...@@ -519,6 +543,8 @@ private: ...@@ -519,6 +543,8 @@ private:
std::vector<CUdeviceptr> autoclearBuffers; std::vector<CUdeviceptr> autoclearBuffers;
std::vector<int> autoclearBufferSizes; std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners; std::vector<ReorderListener*> reorderListeners;
std::vector<ForcePreComputation*> preComputations;
std::vector<ForcePostComputation*> postComputations;
CudaIntegrationUtilities* integration; CudaIntegrationUtilities* integration;
CudaExpressionUtilities* expression; CudaExpressionUtilities* expression;
CudaBondedUtilities* bonded; CudaBondedUtilities* bonded;
...@@ -580,7 +606,7 @@ private: ...@@ -580,7 +606,7 @@ private:
/** /**
* This abstract class defines a function to be executed whenever atoms get reordered. * This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a reorderListener * Objects that need to know when reordering happens should create a ReorderListener
* and register it by calling addReorderListener(). * and register it by calling addReorderListener().
*/ */
class CudaContext::ReorderListener { class CudaContext::ReorderListener {
...@@ -590,6 +616,39 @@ public: ...@@ -590,6 +616,39 @@ public:
} }
}; };
/**
* This abstract class defines a function to be executed at the very beginning of force and
* energy evaluation, before any other calculation has been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePreComputation, register it by calling addForcePreComputation().
*/
class CudaContext::ForcePreComputation {
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
*/
virtual void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
};
/**
* This abstract class defines a function to be executed at the very end of force and
* energy evaluation, after all other calculations have been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePostComputation, register it by calling addForcePostComputation().
*/
class CudaContext::ForcePostComputation {
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
* @return an optional contribution to add to the potential energy.
*/
virtual double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
};
} // namespace OpenMM } // namespace OpenMM
#endif /*OPENMM_CUDACONTEXT_H_*/ #endif /*OPENMM_CUDACONTEXT_H_*/
...@@ -181,7 +181,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express ...@@ -181,7 +181,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::ACOS: case Operation::ACOS:
out << "ACSO(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ACOS(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::ATAN: case Operation::ATAN:
out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")";
......
...@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) { void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cu.setAsCurrent(); cu.setAsCurrent();
cu.clearAutoclearBuffers();
for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
(*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0); bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
cu.setComputeForceCount(cu.getComputeForceCount()+1); cu.setComputeForceCount(cu.getComputeForceCount()+1);
cu.clearAutoclearBuffers();
if (includeNonbonded) if (includeNonbonded)
nb.prepareInteractions(); nb.prepareInteractions();
} }
...@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo ...@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu.getBondedUtilities().computeInteractions(groups); cu.getBondedUtilities().computeInteractions(groups);
if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0) if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
cu.getNonbondedUtilities().computeInteractions(); cu.getNonbondedUtilities().computeInteractions();
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
double sum = 0.0; double sum = 0.0;
for (vector<CudaContext::ForcePostComputation*>::iterator iter = cu.getPostComputations().begin(); iter != cu.getPostComputations().end(); ++iter)
sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy) {
CudaArray& energyArray = cu.getEnergyBuffer(); CudaArray& energyArray = cu.getEnergyBuffer();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision()) {
...@@ -1330,6 +1334,58 @@ private: ...@@ -1330,6 +1334,58 @@ private:
const NonbondedForce& force; const NonbondedForce& force;
}; };
class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel), forceTemp(NULL) {
forceTemp = CudaArray::create<float4>(cu, cu.getNumAtoms(), "PmeForce");
}
~PmeIO() {
if (forceTemp != NULL)
delete forceTemp;
}
float* getPosq() {
cu.setAsCurrent();
cu.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp->upload(force);
void* args[] = {&forceTemp->getDevicePointer(), &cu.getForce().getDevicePointer()};
cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
}
private:
CudaContext& cu;
vector<float4> posq;
CudaArray* forceTemp;
CUfunction addForcesKernel;
};
class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
public:
PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxSize(cu.getPeriodicBoxSize().x, cu.getPeriodicBoxSize().y, cu.getPeriodicBoxSize().z);
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
}
private:
CudaContext& cu;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent(); cu.setAsCurrent();
if (sigmaEpsilon != NULL) if (sigmaEpsilon != NULL)
...@@ -1354,6 +1410,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1354,6 +1410,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (pmeio != NULL)
delete pmeio;
if (hasInitializedFFT) { if (hasInitializedFFT) {
cufftDestroy(fftForward); cufftDestroy(fftForward);
cufftDestroy(fftBackward); cufftDestroy(fftBackward);
...@@ -1457,7 +1515,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1457,7 +1515,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
else else
dispersionCoefficient = 0.0; dispersionCoefficient = 0.0;
alpha = 0; alpha = 0;
if (force.getNonbondedMethod() == NonbondedForce::Ewald) { if (force.getNonbondedMethod() == NonbondedForce::Ewald && cu.getContextIndex() == 0) {
// Compute the Ewald parameters. // Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz; int kmaxx, kmaxy, kmaxz;
...@@ -1465,7 +1523,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1465,7 +1523,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
// Create the reciprocal space kernels. // Create the reciprocal space kernels.
...@@ -1484,7 +1542,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1484,7 +1542,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums"); cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
} }
else if (force.getNonbondedMethod() == NonbondedForce::PME) { else if (force.getNonbondedMethod() == NonbondedForce::PME && cu.getContextIndex() == 0) {
// Compute the PME parameters. // Compute the PME parameters.
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
...@@ -1497,7 +1555,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1497,7 +1555,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder); pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles); pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
...@@ -1510,6 +1568,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1510,6 +1568,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
pmeDefines["USE_DOUBLE_PRECISION"] = "1"; pmeDefines["USE_DOUBLE_PRECISION"] = "1";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
if (cu.getPlatformData().useCpuPme) {
// Create the CPU PME kernel.
try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
CUfunction addForcesKernel = cu.getKernel(module, "addForces");
pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
}
}
if (pmeio == NULL) {
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex"); pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge"); pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
...@@ -1618,6 +1692,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1618,6 +1692,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
} }
} }
} }
}
else else
ewaldSelfEnergy = 0.0; ewaldSelfEnergy = 0.0;
...@@ -1654,13 +1729,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1654,13 +1729,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
} }
double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) { double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (cosSinSums != NULL && includeReciprocal) {
void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize()); cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize());
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (directPmeGrid != NULL && includeReciprocal) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
......
...@@ -557,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { ...@@ -557,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) { pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), pmeio(NULL) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -596,6 +596,9 @@ private: ...@@ -596,6 +596,9 @@ private:
const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";} const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
const char* getSortKey() const {return "value.y";} const char* getSortKey() const {return "value.y";}
}; };
class PmeIO;
class PmePreComputation;
class PmePostComputation;
CudaContext& cu; CudaContext& cu;
bool hasInitializedFFT; bool hasInitializedFFT;
CudaArray* sigmaEpsilon; CudaArray* sigmaEpsilon;
...@@ -609,6 +612,8 @@ private: ...@@ -609,6 +612,8 @@ private:
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaSort* sort; CudaSort* sort;
Kernel cpuPme;
PmeIO* pmeio;
cufftHandle fftForward; cufftHandle fftForward;
cufftHandle fftBackward; cufftHandle fftBackward;
CUfunction ewaldSumsKernel; CUfunction ewaldSumsKernel;
......
...@@ -87,12 +87,14 @@ CudaPlatform::CudaPlatform() { ...@@ -87,12 +87,14 @@ CudaPlatform::CudaPlatform() {
platformProperties.push_back(CudaDeviceName()); platformProperties.push_back(CudaDeviceName());
platformProperties.push_back(CudaUseBlockingSync()); platformProperties.push_back(CudaUseBlockingSync());
platformProperties.push_back(CudaPrecision()); platformProperties.push_back(CudaPrecision());
platformProperties.push_back(CudaUseCpuPme());
platformProperties.push_back(CudaCompiler()); platformProperties.push_back(CudaCompiler());
platformProperties.push_back(CudaTempDirectory()); platformProperties.push_back(CudaTempDirectory());
setPropertyDefaultValue(CudaDeviceIndex(), ""); setPropertyDefaultValue(CudaDeviceIndex(), "");
setPropertyDefaultValue(CudaDeviceName(), ""); setPropertyDefaultValue(CudaDeviceName(), "");
setPropertyDefaultValue(CudaUseBlockingSync(), "true"); setPropertyDefaultValue(CudaUseBlockingSync(), "true");
setPropertyDefaultValue(CudaPrecision(), "single"); setPropertyDefaultValue(CudaPrecision(), "single");
setPropertyDefaultValue(CudaUseCpuPme(), "false");
#ifdef _MSC_VER #ifdef _MSC_VER
char* bindir = getenv("CUDA_BIN_PATH"); char* bindir = getenv("CUDA_BIN_PATH");
string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe"); string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
...@@ -141,13 +143,20 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string ...@@ -141,13 +143,20 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second); getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ? string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second); getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
string cpuPmePropValue = (properties.find(CudaUseCpuPme()) == properties.end() ?
getPropertyDefaultValue(CudaUseCpuPme()) : properties.find(CudaUseCpuPme())->second);
const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ? const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second); getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ? const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second); getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower); transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower); transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue)); transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
vector<string> pmeKernelName;
pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
if (!supportsKernels(pmeKernelName))
cpuPmePropValue = "false";
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue));
} }
void CudaPlatform::contextDestroyed(ContextImpl& context) const { void CudaPlatform::contextDestroyed(ContextImpl& context) const {
...@@ -155,8 +164,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -155,8 +164,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
delete data; delete data;
} }
CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty, CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) { const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
bool blocking = (blockingProperty == "true"); bool blocking = (blockingProperty == "true");
vector<string> devices; vector<string> devices;
size_t searchPos = 0, nextPos; size_t searchPos = 0, nextPos;
...@@ -185,10 +194,12 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev ...@@ -185,10 +194,12 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name"); CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
deviceName << name; deviceName << name;
} }
useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str(); propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str(); propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false"; propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty; propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty; propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty; propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
contextEnergy.resize(contexts.size()); contextEnergy.resize(contexts.size());
......
...@@ -271,3 +271,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __ ...@@ -271,3 +271,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000)); forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
} }
} }
extern "C" __global__
void addForces(const real4* __restrict__ forces, unsigned long long* __restrict__ forceBuffers) {
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
real4 f = forces[atom];
forceBuffers[atom] += static_cast<unsigned long long>((long long) (f.x*0x100000000));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.y*0x100000000));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000));
}
}
...@@ -54,7 +54,7 @@ void testGaussian() { ...@@ -54,7 +54,7 @@ void testGaussian() {
System system; System system;
for (int i = 0; i < numAtoms; i++) for (int i = 0; i < numAtoms; i++)
system.addParticle(1.0); system.addParticle(1.0);
CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory())); platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
......
...@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) { ...@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {
System system; System system;
system.addParticle(0.0); system.addParticle(0.0);
CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory())); platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
......
...@@ -87,17 +87,25 @@ public: ...@@ -87,17 +87,25 @@ public:
static const std::string key = "OpenCLPrecision"; static const std::string key = "OpenCLPrecision";
return key; return key;
} }
/**
* This is the name of the parameter for selecting whether to use the CPU based PME calculation.
*/
static const std::string& OpenCLUseCpuPme() {
static const std::string key = "OpenCLUseCpuPme";
return key;
}
}; };
class OPENMM_EXPORT_OPENCL OpenCLPlatform::PlatformData { class OPENMM_EXPORT_OPENCL OpenCLPlatform::PlatformData {
public: public:
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty); PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty, const std::string& cpuPmeProperty);
~PlatformData(); ~PlatformData();
void initializeContexts(const System& system); void initializeContexts(const System& system);
void syncContexts(); void syncContexts();
ContextImpl* context;
std::vector<OpenCLContext*> contexts; std::vector<OpenCLContext*> contexts;
std::vector<double> contextEnergy; std::vector<double> contextEnergy;
bool removeCM; bool removeCM, useCpuPme;
int cmMotionFrequency; int cmMotionFrequency;
int stepCount, computeForceCount; int stepCount, computeForceCount;
double time; double time;
......
...@@ -334,6 +334,10 @@ OpenCLContext::~OpenCLContext() { ...@@ -334,6 +334,10 @@ OpenCLContext::~OpenCLContext() {
delete forces[i]; delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i]; delete reorderListeners[i];
for (int i = 0; i < (int) preComputations.size(); i++)
delete preComputations[i];
for (int i = 0; i < (int) postComputations.size(); i++)
delete postComputations[i];
if (pinnedBuffer != NULL) if (pinnedBuffer != NULL)
delete pinnedBuffer; delete pinnedBuffer;
if (posq != NULL) if (posq != NULL)
...@@ -1106,6 +1110,14 @@ void OpenCLContext::addReorderListener(ReorderListener* listener) { ...@@ -1106,6 +1110,14 @@ void OpenCLContext::addReorderListener(ReorderListener* listener) {
reorderListeners.push_back(listener); reorderListeners.push_back(listener);
} }
void OpenCLContext::addPreComputation(ForcePreComputation* computation) {
preComputations.push_back(computation);
}
void OpenCLContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation);
}
struct OpenCLContext::WorkThread::ThreadData { struct OpenCLContext::WorkThread::ThreadData {
ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting, bool& finished, ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting, bool& finished,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) : pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
......
...@@ -158,6 +158,8 @@ public: ...@@ -158,6 +158,8 @@ public:
class WorkTask; class WorkTask;
class WorkThread; class WorkThread;
class ReorderListener; class ReorderListener;
class ForcePreComputation;
class ForcePostComputation;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData); OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
...@@ -554,6 +556,28 @@ public: ...@@ -554,6 +556,28 @@ public:
std::vector<ReorderListener*>& getReorderListeners() { std::vector<ReorderListener*>& getReorderListeners() {
return reorderListeners; return reorderListeners;
} }
/**
* Add a pre-computation that should be called at the very start of force and energy evaluations.
* The OpenCLContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void addPreComputation(ForcePreComputation* computation);
/**
* Get the list of ForcePreComputations.
*/
std::vector<ForcePreComputation*>& getPreComputations() {
return preComputations;
}
/**
* Add a post-computation that should be called at the very end of force and energy evaluations.
* The OpenCLContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void addPostComputation(ForcePostComputation* computation);
/**
* Get the list of ForcePostComputations.
*/
std::vector<ForcePostComputation*>& getPostComputations() {
return postComputations;
}
/** /**
* Mark that the current molecule definitions (and hence the atom order) may be invalid. * Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions * This should be called whenever force field parameters change. It will cause the definitions
...@@ -625,6 +649,8 @@ private: ...@@ -625,6 +649,8 @@ private:
std::vector<cl::Memory*> autoclearBuffers; std::vector<cl::Memory*> autoclearBuffers;
std::vector<int> autoclearBufferSizes; std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners; std::vector<ReorderListener*> reorderListeners;
std::vector<ForcePreComputation*> preComputations;
std::vector<ForcePostComputation*> postComputations;
OpenCLIntegrationUtilities* integration; OpenCLIntegrationUtilities* integration;
OpenCLExpressionUtilities* expression; OpenCLExpressionUtilities* expression;
OpenCLBondedUtilities* bonded; OpenCLBondedUtilities* bonded;
...@@ -686,7 +712,7 @@ private: ...@@ -686,7 +712,7 @@ private:
/** /**
* This abstract class defines a function to be executed whenever atoms get reordered. * This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a reorderListener * Objects that need to know when reordering happens should create a ReorderListener
* and register it by calling addReorderListener(). * and register it by calling addReorderListener().
*/ */
class OpenCLContext::ReorderListener { class OpenCLContext::ReorderListener {
...@@ -696,6 +722,39 @@ public: ...@@ -696,6 +722,39 @@ public:
} }
}; };
/**
* This abstract class defines a function to be executed at the very beginning of force and
* energy evaluation, before any other calculation has been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePreComputation, register it by calling addForcePreComputation().
*/
class OpenCLContext::ForcePreComputation {
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
*/
virtual void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
};
/**
* This abstract class defines a function to be executed at the very end of force and
* energy evaluation, after all other calculations have been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePostComputation, register it by calling addForcePostComputation().
*/
class OpenCLContext::ForcePostComputation {
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
* @return an optional contribution to add to the potential energy.
*/
virtual double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
};
} // namespace OpenMM } // namespace OpenMM
#endif /*OPENMM_OPENCLCONTEXT_H_*/ #endif /*OPENMM_OPENCLCONTEXT_H_*/
...@@ -104,10 +104,12 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -104,10 +104,12 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
} }
void OpenCLCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) { void OpenCLCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cl.clearAutoclearBuffers();
for (vector<OpenCLContext::ForcePreComputation*>::iterator iter = cl.getPreComputations().begin(); iter != cl.getPreComputations().end(); ++iter)
(*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities(); OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0); bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
cl.setComputeForceCount(cl.getComputeForceCount()+1); cl.setComputeForceCount(cl.getComputeForceCount()+1);
cl.clearAutoclearBuffers();
if (includeNonbonded) if (includeNonbonded)
nb.prepareInteractions(); nb.prepareInteractions();
} }
...@@ -117,8 +119,10 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, ...@@ -117,8 +119,10 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
if ((groups&(1<<cl.getNonbondedUtilities().getForceGroup())) != 0) if ((groups&(1<<cl.getNonbondedUtilities().getForceGroup())) != 0)
cl.getNonbondedUtilities().computeInteractions(); cl.getNonbondedUtilities().computeInteractions();
cl.reduceForces(); cl.reduceForces();
double sum = 0.0;
for (vector<OpenCLContext::ForcePostComputation*>::iterator iter = cl.getPostComputations().begin(); iter != cl.getPostComputations().end(); ++iter)
sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
cl.getIntegrationUtilities().distributeForcesFromVirtualSites(); cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
double sum = 0.0f;
if (includeEnergy) { if (includeEnergy) {
OpenCLArray& energyArray = cl.getEnergyBuffer(); OpenCLArray& energyArray = cl.getEnergyBuffer();
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
...@@ -1323,6 +1327,58 @@ private: ...@@ -1323,6 +1327,58 @@ private:
const NonbondedForce& force; const NonbondedForce& force;
}; };
class OpenCLCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(OpenCLContext& cl, cl::Kernel addForcesKernel) : cl(cl), addForcesKernel(addForcesKernel), forceTemp(NULL) {
forceTemp = OpenCLArray::create<mm_float4>(cl, cl.getNumAtoms(), "PmeForce");
addForcesKernel.setArg<cl::Buffer>(0, forceTemp->getDeviceBuffer());
}
~PmeIO() {
if (forceTemp != NULL)
delete forceTemp;
}
float* getPosq() {
cl.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp->upload(force);
addForcesKernel.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
cl.executeKernel(addForcesKernel, cl.getNumAtoms());
}
private:
OpenCLContext& cl;
vector<mm_float4> posq;
OpenCLArray* forceTemp;
cl::Kernel addForcesKernel;
};
class OpenCLCalcNonbondedForceKernel::PmePreComputation : public OpenCLContext::ForcePreComputation {
public:
PmePreComputation(OpenCLContext& cl, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cl(cl), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxSize(cl.getPeriodicBoxSize().x, cl.getPeriodicBoxSize().y, cl.getPeriodicBoxSize().z);
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
}
private:
OpenCLContext& cl;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class OpenCLCalcNonbondedForceKernel::PmePostComputation : public OpenCLContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() { OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
if (sigmaEpsilon != NULL) if (sigmaEpsilon != NULL)
delete sigmaEpsilon; delete sigmaEpsilon;
...@@ -1350,6 +1406,8 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() { ...@@ -1350,6 +1406,8 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
delete sort; delete sort;
if (fft != NULL) if (fft != NULL)
delete fft; delete fft;
if (pmeio != NULL)
delete pmeio;
} }
void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) { void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
...@@ -1430,7 +1488,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1430,7 +1488,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
else else
dispersionCoefficient = 0.0; dispersionCoefficient = 0.0;
alpha = 0; alpha = 0;
if (force.getNonbondedMethod() == NonbondedForce::Ewald) { if (force.getNonbondedMethod() == NonbondedForce::Ewald && cl.getContextIndex() == 0) {
// Compute the Ewald parameters. // Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz; int kmaxx, kmaxy, kmaxz;
...@@ -1438,7 +1496,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1438,7 +1496,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
defines["EWALD_ALPHA"] = cl.doubleToString(alpha); defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
ewaldSelfEnergy = (cl.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
// Create the reciprocal space kernels. // Create the reciprocal space kernels.
...@@ -1454,7 +1512,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1454,7 +1512,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums"); cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
} }
else if (force.getNonbondedMethod() == NonbondedForce::PME) { else if (force.getNonbondedMethod() == NonbondedForce::PME && cl.getContextIndex() == 0) {
// Compute the PME parameters. // Compute the PME parameters.
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
...@@ -1465,7 +1523,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1465,7 +1523,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
defines["EWALD_ALPHA"] = cl.doubleToString(alpha); defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
ewaldSelfEnergy = (cl.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder); pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles); pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha)); pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
...@@ -1476,7 +1534,23 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1476,7 +1534,23 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) if (deviceIsCpu)
pmeDefines["DEVICE_IS_CPU"] = "1"; pmeDefines["DEVICE_IS_CPU"] = "1";
if (cl.getPlatformData().useCpuPme) {
// Create the CPU PME kernel.
try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
pmeio = new PmeIO(cl, addForcesKernel);
cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
}
}
if (pmeio == NULL) {
// Create required data structures. // Create required data structures.
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
...@@ -1565,6 +1639,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1565,6 +1639,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
} }
} }
} }
}
else else
ewaldSelfEnergy = 0.0; ewaldSelfEnergy = 0.0;
...@@ -1650,7 +1725,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1650,7 +1725,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
} }
} }
} }
if (cosSinSums != NULL && cl.getContextIndex() == 0 && includeReciprocal) { if (cosSinSums != NULL && includeReciprocal) {
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
mm_double4 recipBoxSize = mm_double4(2*M_PI/boxSize.x, 2*M_PI/boxSize.y, 2*M_PI/boxSize.z, 0.0); mm_double4 recipBoxSize = mm_double4(2*M_PI/boxSize.x, 2*M_PI/boxSize.y, 2*M_PI/boxSize.z, 0.0);
double recipCoefficient = ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z); double recipCoefficient = ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z);
...@@ -1669,7 +1744,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1669,7 +1744,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldSumsKernel, cosSinSums->getSize()); cl.executeKernel(ewaldSumsKernel, cosSinSums->getSize());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms()); cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
} }
if (pmeGrid != NULL && cl.getContextIndex() == 0 && includeReciprocal) { if (pmeGrid != NULL && includeReciprocal) {
setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4); setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4);
setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5); setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5);
cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms()); cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
......
...@@ -557,7 +557,7 @@ public: ...@@ -557,7 +557,7 @@ public:
OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform), OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL), hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) { pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
} }
~OpenCLCalcNonbondedForceKernel(); ~OpenCLCalcNonbondedForceKernel();
/** /**
...@@ -596,6 +596,9 @@ private: ...@@ -596,6 +596,9 @@ private:
const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";} const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";}
const char* getSortKey() const {return "value.y";} const char* getSortKey() const {return "value.y";}
}; };
class PmeIO;
class PmePreComputation;
class PmePostComputation;
OpenCLContext& cl; OpenCLContext& cl;
bool hasInitializedKernel; bool hasInitializedKernel;
OpenCLArray* sigmaEpsilon; OpenCLArray* sigmaEpsilon;
...@@ -611,6 +614,8 @@ private: ...@@ -611,6 +614,8 @@ private:
OpenCLArray* pmeAtomGridIndex; OpenCLArray* pmeAtomGridIndex;
OpenCLSort* sort; OpenCLSort* sort;
OpenCLFFT3D* fft; OpenCLFFT3D* fft;
Kernel cpuPme;
PmeIO* pmeio;
cl::Kernel ewaldSumsKernel; cl::Kernel ewaldSumsKernel;
cl::Kernel ewaldForcesKernel; cl::Kernel ewaldForcesKernel;
cl::Kernel pmeGridIndexKernel; cl::Kernel pmeGridIndexKernel;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment