Commit 18295108 authored by peastman's avatar peastman
Browse files

Merge changes from main branch

parents e6101f68 8d7234e5
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2014 Stanford University and the Authors. * * Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -158,8 +158,11 @@ private: ...@@ -158,8 +158,11 @@ private:
CudaArray* vsite3AvgWeights; CudaArray* vsite3AvgWeights;
CudaArray* vsiteOutOfPlaneAtoms; CudaArray* vsiteOutOfPlaneAtoms;
CudaArray* vsiteOutOfPlaneWeights; CudaArray* vsiteOutOfPlaneWeights;
CudaArray* vsiteLocalCoordsIndex;
CudaArray* vsiteLocalCoordsAtoms; CudaArray* vsiteLocalCoordsAtoms;
CudaArray* vsiteLocalCoordsParams; CudaArray* vsiteLocalCoordsWeights;
CudaArray* vsiteLocalCoordsPos;
CudaArray* vsiteLocalCoordsStartIndex;
int randomPos; int randomPos;
int lastSeed, numVsites; int lastSeed, numVsites;
double2 lastStepSize; double2 lastStepSize;
......
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
#include "openmm/internal/CompiledExpressionSet.h" #include "openmm/internal/CompiledExpressionSet.h"
#include "openmm/internal/CustomIntegratorUtilities.h" #include "openmm/internal/CustomIntegratorUtilities.h"
#include "lepton/CompiledExpression.h" #include "lepton/CompiledExpression.h"
#include "lepton/ExpressionProgram.h"
#include <cufft.h> #include <cufft.h>
namespace OpenMM { namespace OpenMM {
...@@ -1229,6 +1230,54 @@ private: ...@@ -1229,6 +1230,54 @@ private:
CUevent event; CUevent event;
}; };
/**
* This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
public:
CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
cu(cu), hasInitializedListeners(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
}
~CudaCalcCustomCVForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomCVForce this kernel will be used for
* @param innerContext the context created by the CustomCVForce for computing collective variables
*/
void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param innerContext the context created by the CustomCVForce for computing collective variables
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
/**
* Copy state information to the inner context.
*
* @param context the context in which to execute this kernel
* @param innerContext the context created by the CustomCVForce for computing collective variables
*/
void copyState(ContextImpl& context, ContextImpl& innerContext);
private:
class ReorderListener;
CudaContext& cu;
bool hasInitializedListeners;
Lepton::ExpressionProgram energyExpression;
std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
std::vector<CudaArray*> cvForces;
CudaArray* invAtomOrder;
CudaArray* innerInvAtomOrder;
CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
};
/** /**
* This kernel is invoked by VerletIntegrator to take one time step. * This kernel is invoked by VerletIntegrator to take one time step.
*/ */
...@@ -1485,7 +1534,9 @@ private: ...@@ -1485,7 +1534,9 @@ private:
class ReorderListener; class ReorderListener;
class GlobalTarget; class GlobalTarget;
class DerivFunction; class DerivFunction;
std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName); std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
std::vector<std::pair<std::string, std::string> >& functionNames);
void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid); void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context); Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes); void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
...@@ -1495,7 +1546,7 @@ private: ...@@ -1495,7 +1546,7 @@ private:
CudaContext& cu; CudaContext& cu;
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
CudaArray* globalValues; CudaArray* globalValues;
...@@ -1504,6 +1555,8 @@ private: ...@@ -1504,6 +1555,8 @@ private:
CudaArray* uniformRandoms; CudaArray* uniformRandoms;
CudaArray* randomSeed; CudaArray* randomSeed;
CudaArray* perDofEnergyParamDerivs; CudaArray* perDofEnergyParamDerivs;
std::vector<CudaArray*> tabulatedFunctions;
std::map<int, double> savedEnergy;
std::map<int, CudaArray*> savedForces; std::map<int, CudaArray*> savedForces;
std::set<int> validSavedForces; std::set<int> validSavedForces;
CudaParameterSet* perDofValues; CudaParameterSet* perDofValues;
...@@ -1587,7 +1640,7 @@ private: ...@@ -1587,7 +1640,7 @@ private:
class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel { class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
public: public:
CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu), CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu),
hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) { hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
} }
~CudaApplyMonteCarloBarostatKernel(); ~CudaApplyMonteCarloBarostatKernel();
/** /**
...@@ -1622,6 +1675,7 @@ private: ...@@ -1622,6 +1675,7 @@ private:
bool hasInitializedKernels; bool hasInitializedKernels;
int numMolecules; int numMolecules;
CudaArray* savedPositions; CudaArray* savedPositions;
CudaArray* savedForces;
CudaArray* moleculeAtoms; CudaArray* moleculeAtoms;
CudaArray* moleculeStartIndex; CudaArray* moleculeStartIndex;
CUfunction kernel; CUfunction kernel;
......
...@@ -53,6 +53,7 @@ public: ...@@ -53,6 +53,7 @@ public:
const std::string& getPropertyValue(const Context& context, const std::string& property) const; const std::string& getPropertyValue(const Context& context, const std::string& property) const;
void setPropertyValue(Context& context, const std::string& property, const std::string& value) const; void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const; void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
void contextDestroyed(ContextImpl& context) const; void contextDestroyed(ContextImpl& context) const;
/** /**
* This is the name of the parameter for selecting which CUDA device or devices to use. * This is the name of the parameter for selecting which CUDA device or devices to use.
...@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData { ...@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
public: public:
PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty, PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty, const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads); const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext);
~PlatformData(); ~PlatformData();
void initializeContexts(const System& system); void initializeContexts(const System& system);
void syncContexts(); void syncContexts();
......
...@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) { ...@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) {
#endif #endif
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0), const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL), pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
// Determine what compiler to use. // Determine what compiler to use.
...@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
cacheDir = cacheDir+"/"; cacheDir = cacheDir+"/";
#endif #endif
contextIndex = platformData.contexts.size(); contextIndex = platformData.contexts.size();
int numDevices;
string errorMessage = "Error initializing Context"; string errorMessage = "Error initializing Context";
CHECK_RESULT(cuDeviceGetCount(&numDevices)); if (originalContext == NULL) {
if (deviceIndex < -1 || deviceIndex >= numDevices) isLinkedContext = false;
throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex)); int numDevices;
CHECK_RESULT(cuDeviceGetCount(&numDevices));
vector<int> devicePrecedence; if (deviceIndex < -1 || deviceIndex >= numDevices)
if (deviceIndex == -1) { throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
devicePrecedence = getDevicePrecedence();
} else { vector<int> devicePrecedence;
devicePrecedence.push_back(deviceIndex); if (deviceIndex == -1) {
} devicePrecedence = getDevicePrecedence();
} else {
this->deviceIndex = -1; devicePrecedence.push_back(deviceIndex);
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) { }
int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) { this->deviceIndex = -1;
this->deviceIndex = trialDeviceIndex; for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
break; int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
this->deviceIndex = trialDeviceIndex;
break;
}
} }
if (this->deviceIndex == -1)
if (deviceIndex != -1)
throw OpenMMException("The requested CUDA device could not be loaded");
else
throw OpenMMException("No compatible CUDA device is available");
}
else {
isLinkedContext = true;
context = originalContext->context;
this->deviceIndex = originalContext->deviceIndex;
this->device = originalContext->device;
} }
if (this->deviceIndex == -1)
if (deviceIndex != -1)
throw OpenMMException("The requested CUDA device could not be loaded");
else
throw OpenMMException("No compatible CUDA device is available");
int major, minor; int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
...@@ -227,6 +236,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -227,6 +236,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
minor = 3; minor = 3;
} }
} }
if (major == 7) {
// Don't generate Volta-specific code until we've made the changes needed
// to support it properly.
major = 6;
minor = 0;
}
gpuArchitecture = intToString(major)+intToString(minor); gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor; computeCapability = major+0.1*minor;
...@@ -292,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -292,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers"); clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers"); clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers"); clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
setChargesKernel = getKernel(utilities, "setCharges"); setChargesKernel = getKernel(utilities, "setCharges");
// Set defines based on the requested precision. // Set defines based on the requested precision.
...@@ -405,6 +421,8 @@ CudaContext::~CudaContext() { ...@@ -405,6 +421,8 @@ CudaContext::~CudaContext() {
delete force; delete force;
if (energyBuffer != NULL) if (energyBuffer != NULL)
delete energyBuffer; delete energyBuffer;
if (energySum != NULL)
delete energySum;
if (energyParamDerivBuffer != NULL) if (energyParamDerivBuffer != NULL)
delete energyParamDerivBuffer; delete energyParamDerivBuffer;
if (atomIndexDevice != NULL) if (atomIndexDevice != NULL)
...@@ -422,7 +440,7 @@ CudaContext::~CudaContext() { ...@@ -422,7 +440,7 @@ CudaContext::~CudaContext() {
if (thread != NULL) if (thread != NULL)
delete thread; delete thread;
string errorMessage = "Error deleting Context"; string errorMessage = "Error deleting Context";
if (contextIsValid) { if (contextIsValid && !isLinkedContext) {
cuProfilerStop(); cuProfilerStop();
CHECK_RESULT(cuCtxDestroy(context)); CHECK_RESULT(cuCtxDestroy(context));
} }
...@@ -435,16 +453,19 @@ void CudaContext::initialize() { ...@@ -435,16 +453,19 @@ void CudaContext::initialize() {
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else { else {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<float>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
} }
...@@ -864,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() { ...@@ -864,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() {
} }
} }
double CudaContext::reduceEnergy() {
int bufferSize = energyBuffer->getSize();
int workGroupSize = 512;
void* args[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer->getElementSize());
energySum->download(pinnedBuffer);
if (getUseDoublePrecision() || getUseMixedPrecision())
return *((double*) pinnedBuffer);
else
return *((float*) pinnedBuffer);
}
void CudaContext::setCharges(const vector<double>& charges) { void CudaContext::setCharges(const vector<double>& charges) {
if (chargeBuffer == NULL) if (chargeBuffer == NULL)
chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
...@@ -1050,9 +1083,16 @@ void CudaContext::findMoleculeGroups() { ...@@ -1050,9 +1083,16 @@ void CudaContext::findMoleculeGroups() {
for (int i = 0; i < (int) forces.size() && identical; i++) { for (int i = 0; i < (int) forces.size() && identical; i++) {
if (mol.groups[i].size() != mol2.groups[i].size()) if (mol.groups[i].size() != mol2.groups[i].size())
identical = false; identical = false;
for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k])) if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
identical = false; identical = false;
vector<int> p1, p2;
forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
for (int m = 0; m < p1.size(); m++)
if (p1[m] != p2[m]-atomOffset)
identical = false;
}
} }
if (identical) { if (identical) {
moleculeInstances[j].push_back(molIndex); moleculeInstances[j].push_back(molIndex);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. * * Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -103,7 +103,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -103,7 +103,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL), ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL), ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL) {
// Create workspace arrays. // Create workspace arrays.
lastStepSize = make_double2(0.0, 0.0); lastStepSize = make_double2(0.0, 0.0);
...@@ -454,8 +455,11 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -454,8 +455,11 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vector<double4> vsite3AvgWeightVec; vector<double4> vsite3AvgWeightVec;
vector<int4> vsiteOutOfPlaneAtomVec; vector<int4> vsiteOutOfPlaneAtomVec;
vector<double4> vsiteOutOfPlaneWeightVec; vector<double4> vsiteOutOfPlaneWeightVec;
vector<int4> vsiteLocalCoordsAtomVec; vector<int> vsiteLocalCoordsIndexVec;
vector<double> vsiteLocalCoordsParamVec; vector<int> vsiteLocalCoordsAtomVec;
vector<int> vsiteLocalCoordsStartVec;
vector<double> vsiteLocalCoordsWeightVec;
vector<double4> vsiteLocalCoordsPosVec;
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
if (system.isVirtualSite(i)) { if (system.isVirtualSite(i)) {
if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) { if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
...@@ -480,64 +484,72 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -480,64 +484,72 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0)); vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
} }
else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) { else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
// An out of plane site. // A local coordinates site.
const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i)); const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
vsiteLocalCoordsAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2))); int numParticles = site.getNumParticles();
Vec3 origin = site.getOriginWeights(); vector<double> origin, x, y;
Vec3 x = site.getXWeights(); site.getOriginWeights(origin);
Vec3 y = site.getYWeights(); site.getXWeights(x);
site.getYWeights(y);
vsiteLocalCoordsIndexVec.push_back(i);
vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
for (int j = 0; j < numParticles; j++) {
vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
vsiteLocalCoordsWeightVec.push_back(origin[j]);
vsiteLocalCoordsWeightVec.push_back(x[j]);
vsiteLocalCoordsWeightVec.push_back(y[j]);
}
Vec3 pos = site.getLocalPosition(); Vec3 pos = site.getLocalPosition();
vsiteLocalCoordsParamVec.push_back(origin[0]); vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
vsiteLocalCoordsParamVec.push_back(origin[1]);
vsiteLocalCoordsParamVec.push_back(origin[2]);
vsiteLocalCoordsParamVec.push_back(x[0]);
vsiteLocalCoordsParamVec.push_back(x[1]);
vsiteLocalCoordsParamVec.push_back(x[2]);
vsiteLocalCoordsParamVec.push_back(y[0]);
vsiteLocalCoordsParamVec.push_back(y[1]);
vsiteLocalCoordsParamVec.push_back(y[2]);
vsiteLocalCoordsParamVec.push_back(pos[0]);
vsiteLocalCoordsParamVec.push_back(pos[1]);
vsiteLocalCoordsParamVec.push_back(pos[2]);
} }
} }
} }
vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
int num2Avg = vsite2AvgAtomVec.size(); int num2Avg = vsite2AvgAtomVec.size();
int num3Avg = vsite3AvgAtomVec.size(); int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size(); int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
int numLocalCoords = vsiteLocalCoordsAtomVec.size(); int numLocalCoords = vsiteLocalCoordsPosVec.size();
vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms"); vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms"); vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms"); vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteLocalCoordsAtoms = CudaArray::create<int4>(context, max(1, numLocalCoords), "vsiteLocalCoordinatesAtoms"); vsiteLocalCoordsIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
vsiteLocalCoordsAtoms = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
vsiteLocalCoordsStartIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
if (num2Avg > 0) if (num2Avg > 0)
vsite2AvgAtoms->upload(vsite2AvgAtomVec); vsite2AvgAtoms->upload(vsite2AvgAtomVec);
if (num3Avg > 0) if (num3Avg > 0)
vsite3AvgAtoms->upload(vsite3AvgAtomVec); vsite3AvgAtoms->upload(vsite3AvgAtomVec);
if (numOutOfPlane > 0) if (numOutOfPlane > 0)
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec); vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
if (numLocalCoords > 0) if (numLocalCoords > 0) {
vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec);
vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec); vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec);
vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec);
}
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision()) {
vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights"); vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights"); vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsParams = CudaArray::create<double>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams"); vsiteLocalCoordsWeights = CudaArray::create<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = CudaArray::create<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) if (num2Avg > 0)
vsite2AvgWeights->upload(vsite2AvgWeightVec); vsite2AvgWeights->upload(vsite2AvgWeightVec);
if (num3Avg > 0) if (num3Avg > 0)
vsite3AvgWeights->upload(vsite3AvgWeightVec); vsite3AvgWeights->upload(vsite3AvgWeightVec);
if (numOutOfPlane > 0) if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec); vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
if (numLocalCoords > 0) if (numLocalCoords > 0) {
vsiteLocalCoordsParams->upload(vsiteLocalCoordsParamVec); vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec);
vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec);
}
} }
else { else {
vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights"); vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights"); vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsParams = CudaArray::create<float>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams"); vsiteLocalCoordsWeights = CudaArray::create<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = CudaArray::create<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) { if (num2Avg > 0) {
vector<float2> floatWeights(num2Avg); vector<float2> floatWeights(num2Avg);
for (int i = 0; i < num2Avg; i++) for (int i = 0; i < num2Avg; i++)
...@@ -557,10 +569,14 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -557,10 +569,14 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneWeights->upload(floatWeights); vsiteOutOfPlaneWeights->upload(floatWeights);
} }
if (numLocalCoords > 0) { if (numLocalCoords > 0) {
vector<float> floatParams(vsiteLocalCoordsParamVec.size()); vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsParamVec.size(); i++) for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
floatParams[i] = (float) vsiteLocalCoordsParamVec[i]; floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
vsiteLocalCoordsParams->upload(floatParams); vsiteLocalCoordsWeights->upload(floatWeights);
vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
vsiteLocalCoordsPos->upload(floatPos);
} }
} }
...@@ -644,10 +660,16 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() { ...@@ -644,10 +660,16 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete vsiteOutOfPlaneAtoms; delete vsiteOutOfPlaneAtoms;
if (vsiteOutOfPlaneWeights != NULL) if (vsiteOutOfPlaneWeights != NULL)
delete vsiteOutOfPlaneWeights; delete vsiteOutOfPlaneWeights;
if (vsiteLocalCoordsIndex != NULL)
delete vsiteLocalCoordsIndex;
if (vsiteLocalCoordsAtoms != NULL) if (vsiteLocalCoordsAtoms != NULL)
delete vsiteLocalCoordsAtoms; delete vsiteLocalCoordsAtoms;
if (vsiteLocalCoordsParams != NULL) if (vsiteLocalCoordsWeights != NULL)
delete vsiteLocalCoordsParams; delete vsiteLocalCoordsWeights;
if (vsiteLocalCoordsPos != NULL)
delete vsiteLocalCoordsPos;
if (vsiteLocalCoordsStartIndex != NULL)
delete vsiteLocalCoordsStartIndex;
} }
void CudaIntegrationUtilities::setNextStepSize(double size) { void CudaIntegrationUtilities::setNextStepSize(double size) {
...@@ -747,7 +769,9 @@ void CudaIntegrationUtilities::computeVirtualSites() { ...@@ -747,7 +769,9 @@ void CudaIntegrationUtilities::computeVirtualSites() {
void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(), &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
&vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()}; &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
&vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
&vsiteLocalCoordsStartIndex->getDevicePointer()};
context.executeKernel(vsitePositionKernel, args, numVsites); context.executeKernel(vsitePositionKernel, args, numVsites);
} }
} }
...@@ -759,7 +783,9 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() { ...@@ -759,7 +783,9 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
&vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(), &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
&vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()}; &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
&vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
&vsiteLocalCoordsStartIndex->getDevicePointer()};
context.executeKernel(vsiteForceKernel, args, numVsites); context.executeKernel(vsiteForceKernel, args, numVsites);
} }
} }
......
...@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform ...@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name()) if (name == CalcCustomCompoundBondForceKernel::Name())
return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCVForceKernel::Name())
return new CudaCalcCustomCVForceKernel(name, platform, cu);
if (name == CalcCustomManyParticleForceKernel::Name()) if (name == CalcCustomManyParticleForceKernel::Name())
return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGayBerneForceKernel::Name()) if (name == CalcGayBerneForceKernel::Name())
......
...@@ -48,6 +48,7 @@ ...@@ -48,6 +48,7 @@
#include "lepton/Operation.h" #include "lepton/Operation.h"
#include "lepton/Parser.h" #include "lepton/Parser.h"
#include "lepton/ParsedExpression.h" #include "lepton/ParsedExpression.h"
#include "ReferenceTabulatedFunction.h"
#include "SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMUtilities.h" #include "SimTKOpenMMUtilities.h"
#include <algorithm> #include <algorithm>
...@@ -114,21 +115,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo ...@@ -114,21 +115,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
for (auto computation : cu.getPostComputations()) for (auto computation : cu.getPostComputations())
sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups); sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
cu.getIntegrationUtilities().distributeForcesFromVirtualSites(); cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy)
CudaArray& energyArray = cu.getEnergyBuffer(); sum += cu.reduceEnergy();
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double* energy = (double*) cu.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
sum += energy[i];
}
else {
float* energy = (float*) cu.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
sum += energy[i];
}
}
if (!cu.getForcesValid()) if (!cu.getForcesValid())
valid = false; valid = false;
return sum; return sum;
...@@ -1805,7 +1793,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1805,7 +1793,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
try { try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context); cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha); cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
CUfunction addForcesKernel = cu.getKernel(module, "addForces"); CUfunction addForcesKernel = cu.getKernel(module, "addForces");
pmeio = new PmeIO(cu, addForcesKernel); pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio)); cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
...@@ -4457,7 +4445,7 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust ...@@ -4457,7 +4445,7 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
const vector<int>& atoms = distance.second; const vector<int>& atoms = distance.second;
string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]]; string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
if (computedDeltas.count(deltaName) == 0) { if (computedDeltas.count(deltaName) == 0) {
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
computedDeltas.insert(deltaName); computedDeltas.insert(deltaName);
} }
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real r_"+deltaName+" = SQRT(delta"+deltaName+".w);\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real r_"+deltaName+" = SQRT(delta"+deltaName+".w);\n");
...@@ -4472,11 +4460,11 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust ...@@ -4472,11 +4460,11 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]]; string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]; string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]];
if (computedDeltas.count(deltaName1) == 0) { if (computedDeltas.count(deltaName1) == 0) {
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[0]]+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[0]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
computedDeltas.insert(deltaName1); computedDeltas.insert(deltaName1);
} }
if (computedDeltas.count(deltaName2) == 0) { if (computedDeltas.count(deltaName2) == 0) {
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[2]]+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[2]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
computedDeltas.insert(deltaName2); computedDeltas.insert(deltaName2);
} }
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real "+angleName+" = computeAngle(delta"+deltaName1+", delta"+deltaName2+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real "+angleName+" = computeAngle(delta"+deltaName1+", delta"+deltaName2+");\n");
...@@ -4494,15 +4482,15 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust ...@@ -4494,15 +4482,15 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
string crossName2 = "cross_"+deltaName2+"_"+deltaName3; string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]]; string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]];
if (computedDeltas.count(deltaName1) == 0) { if (computedDeltas.count(deltaName1) == 0) {
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
computedDeltas.insert(deltaName1); computedDeltas.insert(deltaName1);
} }
if (computedDeltas.count(deltaName2) == 0) { if (computedDeltas.count(deltaName2) == 0) {
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[1]]+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[1]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
computedDeltas.insert(deltaName2); computedDeltas.insert(deltaName2);
} }
if (computedDeltas.count(deltaName3) == 0) { if (computedDeltas.count(deltaName3) == 0) {
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName3+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[3]]+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName3+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[3]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
computedDeltas.insert(deltaName3); computedDeltas.insert(deltaName3);
} }
addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 "+crossName1+" = computeCross(delta"+deltaName1+", delta"+deltaName2+");\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 "+crossName1+" = computeCross(delta"+deltaName1+", delta"+deltaName2+");\n");
...@@ -4521,12 +4509,12 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust ...@@ -4521,12 +4509,12 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) { for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
extraArgs << ", const "+buffer.getType()+"* __restrict__ donor"+buffer.getName(); extraArgs << ", const "+buffer.getType()+"* __restrict__ donor"+buffer.getName();
addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[index];\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[donorIndex];\n");
} }
for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) { for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
extraArgs << ", const "+buffer.getType()+"* __restrict__ acceptor"+buffer.getName(); extraArgs << ", const "+buffer.getType()+"* __restrict__ acceptor"+buffer.getName();
addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[index];\n"); addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[acceptorIndex];\n");
} }
// Now evaluate the expressions. // Now evaluate the expressions.
...@@ -6608,6 +6596,176 @@ void CudaCalcGayBerneForceKernel::sortAtoms() { ...@@ -6608,6 +6596,176 @@ void CudaCalcGayBerneForceKernel::sortAtoms() {
exclusionStartIndex->upload(startIndexVec); exclusionStartIndex->upload(startIndexVec);
} }
class CudaCalcCustomCVForceKernel::ReorderListener : public CudaContext::ReorderListener {
public:
ReorderListener(CudaContext& cu, CudaArray& invAtomOrder) : cu(cu), invAtomOrder(invAtomOrder) {
}
void execute() {
vector<int> invOrder(cu.getPaddedNumAtoms());
const vector<int>& order = cu.getAtomIndex();
for (int i = 0; i < order.size(); i++)
invOrder[order[i]] = i;
invAtomOrder.upload(invOrder);
}
private:
CudaContext& cu;
CudaArray& invAtomOrder;
};
CudaCalcCustomCVForceKernel::~CudaCalcCustomCVForceKernel() {
for (auto force : cvForces)
delete force;
if (invAtomOrder != NULL)
delete invAtomOrder;
if (innerInvAtomOrder != NULL)
delete innerInvAtomOrder;
}
void CudaCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
int numCVs = force.getNumCollectiveVariables();
for (int i = 0; i < force.getNumGlobalParameters(); i++)
globalParameterNames.push_back(force.getGlobalParameterName(i));
// Create custom functions for the tabulated functions.
map<string, Lepton::CustomFunction*> functions;
for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
// Create the expressions.
Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
energyExpression = energyExpr.createProgram();
for (int i = 0; i < numCVs; i++) {
string name = force.getCollectiveVariableName(i);
variableNames.push_back(name);
variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
}
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
string name = force.getEnergyParameterDerivativeName(i);
paramDerivNames.push_back(name);
paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
cu.addEnergyParameterDerivative(name);
}
// Delete the custom functions.
for (auto& function : functions)
delete function.second;
// Copy parameter derivatives from the inner context.
CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
for (auto& param : cu2.getEnergyParamDerivNames())
cu.addEnergyParameterDerivative(param);
// Create arrays for storing information.
int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
for (int i = 0; i < numCVs; i++)
cvForces.push_back(CudaArray::create<long long>(cu, 3*cu.getPaddedNumAtoms(), "cvForce"));
invAtomOrder = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "invAtomOrder");
innerInvAtomOrder = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "innerInvAtomOrder");
// Create the kernels.
stringstream args, add;
for (int i = 0; i < numCVs; i++) {
args << ", long long* __restrict__ force" << i << ", real dEdV" << i;
add << "forces[i] += (long long) (force" << i << "[i]*dEdV" << i << ");\n";
}
map<string, string> replacements;
replacements["PARAMETER_ARGUMENTS"] = args.str();
replacements["ADD_FORCES"] = add.str();
CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customCVForce, replacements));
copyStateKernel = cu.getKernel(module, "copyState");
copyForcesKernel = cu.getKernel(module, "copyForces");
addForcesKernel = cu.getKernel(module, "addForces");
}
double CudaCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
copyState(context, innerContext);
int numCVs = variableNames.size();
int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
vector<double> cvValues;
vector<map<string, double> > cvDerivs(numCVs);
void* copyForcesArgs[] = {NULL, &invAtomOrder->getDevicePointer(), &cu2.getForce().getDevicePointer(), &cu2.getAtomIndexArray().getDevicePointer(), &numAtoms, &paddedNumAtoms};
for (int i = 0; i < numCVs; i++) {
cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
copyForcesArgs[0] = &cvForces[i]->getDevicePointer();
cu.executeKernel(copyForcesKernel, copyForcesArgs, numAtoms);
innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
}
// Compute the energy and forces.
map<string, double> variables;
for (auto& name : globalParameterNames)
variables[name] = context.getParameter(name);
for (int i = 0; i < numCVs; i++)
variables[variableNames[i]] = cvValues[i];
double energy = energyExpression.evaluate(variables);
int bufferSize = cu.getForce().getSize();
vector<void*> addForcesArgs;
addForcesArgs.push_back(&cu.getForce().getDevicePointer());
addForcesArgs.push_back(&bufferSize);
vector<double> dEdV(numCVs);
vector<float> dEdVFloat(numCVs);
for (int i = 0; i < numCVs; i++) {
dEdV[i] = variableDerivExpressions[i].evaluate(variables);
dEdVFloat[i] = (float) dEdV[i];
addForcesArgs.push_back(&cvForces[i]->getDevicePointer());
if (cu.getUseDoublePrecision())
addForcesArgs.push_back(&dEdV[i]);
else
addForcesArgs.push_back(&dEdVFloat[i]);
}
cu.executeKernel(addForcesKernel, &addForcesArgs[0], numAtoms);
// Compute the energy parameter derivatives.
map<string, double>& energyParamDerivs = cu.getEnergyParamDerivWorkspace();
for (int i = 0; i < paramDerivExpressions.size(); i++)
energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
for (int i = 0; i < numCVs; i++) {
double dEdV = variableDerivExpressions[i].evaluate(variables);
for (auto& deriv : cvDerivs[i])
energyParamDerivs[deriv.first] += dEdV*deriv.second;
}
return energy;
}
void CudaCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
int numAtoms = cu.getNumAtoms();
CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
if (!hasInitializedListeners) {
hasInitializedListeners = true;
// Initialize the listeners.
ReorderListener* listener1 = new ReorderListener(cu, *invAtomOrder);
ReorderListener* listener2 = new ReorderListener(cu2, *innerInvAtomOrder);
cu.addReorderListener(listener1);
cu2.addReorderListener(listener2);
listener1->execute();
listener2->execute();
}
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
CUdeviceptr posCorrection2 = (cu2.getUseMixedPrecision() ? cu2.getPosqCorrection().getDevicePointer() : 0);
void* copyStateArgs[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(),
&cu2.getPosq().getDevicePointer(), &posCorrection2,& cu2.getVelm().getDevicePointer(), &innerInvAtomOrder->getDevicePointer(), &numAtoms};
cu.executeKernel(copyStateKernel, copyStateArgs, numAtoms);
Vec3 a, b, c;
context.getPeriodicBoxVectors(a, b, c);
innerContext.setPeriodicBoxVectors(a, b, c);
innerContext.setTime(context.getTime());
map<string, double> innerParameters = innerContext.getParameters();
for (auto& param : innerParameters)
innerContext.setParameter(param.first, context.getParameter(param.first));
}
CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() { CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
} }
...@@ -7061,6 +7219,8 @@ CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() { ...@@ -7061,6 +7219,8 @@ CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
delete perDofEnergyParamDerivs; delete perDofEnergyParamDerivs;
if (perDofValues != NULL) if (perDofValues != NULL)
delete perDofValues; delete perDofValues;
for (auto function : tabulatedFunctions)
delete function;
for (auto& f : savedForces) for (auto& f : savedForces)
delete f.second; delete f.second;
} }
...@@ -7078,7 +7238,8 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo ...@@ -7078,7 +7238,8 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed()); SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
} }
string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const string& forceName, const string& energyName) { string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
const string& forceName, const string& energyName, vector<const TabulatedFunction*>& functions, vector<pair<string, string> >& functionNames) {
const string suffixes[] = {".x", ".y", ".z"}; const string suffixes[] = {".x", ".y", ".z"};
string suffix = suffixes[component]; string suffix = suffixes[component];
map<string, Lepton::ParsedExpression> expressions; map<string, Lepton::ParsedExpression> expressions;
...@@ -7111,8 +7272,6 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari ...@@ -7111,8 +7272,6 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari
variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i); variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i);
for (int i = 0; i < (int) parameterNames.size(); i++) for (int i = 0; i < (int) parameterNames.size(); i++)
variables[parameterNames[i]] = "globals["+cu.intToString(parameterVariableIndex[i])+"]"; variables[parameterNames[i]] = "globals["+cu.intToString(parameterVariableIndex[i])+"]";
vector<const TabulatedFunction*> functions;
vector<pair<string, string> > functionNames;
vector<pair<ExpressionTreeNode, string> > variableNodes; vector<pair<ExpressionTreeNode, string> > variableNodes;
findExpressionsForDerivs(expr.getRootNode(), variableNodes); findExpressionsForDerivs(expr.getRootNode(), variableNodes);
for (auto& var : variables) for (auto& var : variables)
...@@ -7144,18 +7303,41 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7144,18 +7303,41 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
stepTarget.resize(numSteps); stepTarget.resize(numSteps);
merged.resize(numSteps, false); merged.resize(numSteps, false);
modifiesParameters = false; modifiesParameters = false;
sumWorkGroupSize = 512;
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize); defines["WORK_GROUP_SIZE"] = cu.intToString(sumWorkGroupSize);
defines["SUM_BUFFER_SIZE"] = "0"; defines["SUM_BUFFER_SIZE"] = "0";
// Record the tabulated functions.
map<string, Lepton::CustomFunction*> functions;
vector<pair<string, string> > functionNames;
vector<const TabulatedFunction*> functionList;
vector<string> tableTypes;
for (int i = 0; i < integrator.getNumTabulatedFunctions(); i++) {
functionList.push_back(&integrator.getTabulatedFunction(i));
string name = integrator.getTabulatedFunctionName(i);
string arrayName = "table"+cu.intToString(i);
functionNames.push_back(make_pair(name, arrayName));
functions[name] = createReferenceTabulatedFunction(integrator.getTabulatedFunction(i));
int width;
vector<float> f = cu.getExpressionUtilities().computeFunctionCoefficients(integrator.getTabulatedFunction(i), width);
tabulatedFunctions.push_back(CudaArray::create<float>(cu, f.size(), "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
if (width == 1)
tableTypes.push_back("float");
else
tableTypes.push_back("float"+cu.intToString(width));
}
// Record information about all the computation steps. // Record information about all the computation steps.
vector<string> variable(numSteps); vector<string> variable(numSteps);
vector<int> forceGroup; vector<int> forceGroup;
vector<vector<Lepton::ParsedExpression> > expression; vector<vector<Lepton::ParsedExpression> > expression;
CustomIntegratorUtilities::analyzeComputations(context, integrator, expression, comparisons, blockEnd, invalidatesForces, needsForces, needsEnergy, computeBothForceAndEnergy, forceGroup); CustomIntegratorUtilities::analyzeComputations(context, integrator, expression, comparisons, blockEnd, invalidatesForces, needsForces, needsEnergy, computeBothForceAndEnergy, forceGroup, functions);
for (int step = 0; step < numSteps; step++) { for (int step = 0; step < numSteps; step++) {
string expr; string expr;
integrator.getComputationStep(step, stepType[step], variable[step], expr); integrator.getComputationStep(step, stepType[step], variable[step], expr);
...@@ -7326,7 +7508,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7326,7 +7508,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
if (numUniform > 0) if (numUniform > 0)
compute << "float4 uniform = uniformValues[uniformIndex+index];\n"; compute << "float4 uniform = uniformValues[uniformIndex+index];\n";
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], i, integrator, forceName[j], energyName[j]); compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], i, integrator, forceName[j], energyName[j], functionList, functionNames);
if (variable[j] == "x") { if (variable[j] == "x") {
if (storePosAsDelta[j]) if (storePosAsDelta[j])
compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n"; compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n";
...@@ -7357,6 +7539,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7357,6 +7539,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
string valueName = "perDofValues"+cu.intToString(i+1); string valueName = "perDofValues"+cu.intToString(i+1);
args << ", " << buffer.getType() << "* __restrict__ " << valueName; args << ", " << buffer.getType() << "* __restrict__ " << valueName;
} }
for (int i = 0; i < (int) tableTypes.size(); i++)
args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
if (loadPosAsDelta[step]) if (loadPosAsDelta[step])
defines["LOAD_POS_AS_DELTA"] = "1"; defines["LOAD_POS_AS_DELTA"] = "1";
...@@ -7386,6 +7570,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7386,6 +7570,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
args1.push_back(&perDofEnergyParamDerivs->getDevicePointer()); args1.push_back(&perDofEnergyParamDerivs->getDevicePointer());
for (auto& buffer : perDofValues->getBuffers()) for (auto& buffer : perDofValues->getBuffers())
args1.push_back(&buffer.getMemory()); args1.push_back(&buffer.getMemory());
for (auto array : tabulatedFunctions)
args1.push_back(&array->getDevicePointer());
kernelArgs[step].push_back(args1); kernelArgs[step].push_back(args1);
if (stepType[step] == CustomIntegrator::ComputeSum) { if (stepType[step] == CustomIntegrator::ComputeSum) {
// Create a second kernel for this step that sums the values. // Create a second kernel for this step that sums the values.
...@@ -7448,7 +7634,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7448,7 +7634,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
} }
Lepton::ParsedExpression keExpression = Lepton::Parser::parse(integrator.getKineticEnergyExpression()).optimize(); Lepton::ParsedExpression keExpression = Lepton::Parser::parse(integrator.getKineticEnergyExpression()).optimize();
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
computeKE << createPerDofComputation("", keExpression, i, integrator, "f", ""); computeKE << createPerDofComputation("", keExpression, i, integrator, "f", "", functionList, functionNames);
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_STEP"] = computeKE.str(); replacements["COMPUTE_STEP"] = computeKE.str();
stringstream args; stringstream args;
...@@ -7457,6 +7643,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7457,6 +7643,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
string valueName = "perDofValues"+cu.intToString(i+1); string valueName = "perDofValues"+cu.intToString(i+1);
args << ", " << buffer.getType() << "* __restrict__ " << valueName; args << ", " << buffer.getType() << "* __restrict__ " << valueName;
} }
for (int i = 0; i < (int) tableTypes.size(); i++)
args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms); defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
if (defines.find("LOAD_POS_AS_DELTA") != defines.end()) if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
...@@ -7481,6 +7669,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7481,6 +7669,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
kineticEnergyArgs.push_back(&perDofEnergyParamDerivs->getDevicePointer()); kineticEnergyArgs.push_back(&perDofEnergyParamDerivs->getDevicePointer());
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
kineticEnergyArgs.push_back(&perDofValues->getBuffers()[i].getMemory()); kineticEnergyArgs.push_back(&perDofValues->getBuffers()[i].getMemory());
for (auto array : tabulatedFunctions)
kineticEnergyArgs.push_back(&array->getDevicePointer());
keNeedsForce = usesVariable(keExpression, "f"); keNeedsForce = usesVariable(keExpression, "f");
// Create a second kernel to sum the values. // Create a second kernel to sum the values.
...@@ -7488,6 +7678,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7488,6 +7678,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms); defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
module = cu.createModule(CudaKernelSources::customIntegrator, defines); module = cu.createModule(CudaKernelSources::customIntegrator, defines);
sumKineticEnergyKernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum"); sumKineticEnergyKernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
// Delete the custom functions.
for (auto& function : functions)
delete function.second;
} }
// Make sure all values (variables, parameters, etc.) are up to date. // Make sure all values (variables, parameters, etc.) are up to date.
...@@ -7557,6 +7752,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7557,6 +7752,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int numSteps = integrator.getNumComputations(); int numSteps = integrator.getNumComputations();
if (!forcesAreValid)
savedEnergy.clear();
// Loop over computation steps in the integrator and execute them. // Loop over computation steps in the integrator and execute them.
...@@ -7565,14 +7762,19 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7565,14 +7762,19 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
for (int step = 0; step < numSteps; ) { for (int step = 0; step < numSteps; ) {
int nextStep = step+1; int nextStep = step+1;
int forceGroups = forceGroupFlags[step];
int lastForceGroups = context.getLastForceGroups(); int lastForceGroups = context.getLastForceGroups();
if ((needsForces[step] || needsEnergy[step]) && (!forcesAreValid || lastForceGroups != forceGroupFlags[step])) { bool haveForces = (!needsForces[step] || (forcesAreValid && lastForceGroups == forceGroups));
if (forcesAreValid && savedForces.find(lastForceGroups) != savedForces.end()) { bool haveEnergy = (!needsEnergy[step] || savedEnergy.find(forceGroups) != savedEnergy.end());
// The forces are still valid. We just need a different force group right now. Save the old if (!haveForces || !haveEnergy) {
// forces in case we need them again. if (forcesAreValid) {
if (savedForces.find(lastForceGroups) != savedForces.end() && validSavedForces.find(lastForceGroups) == validSavedForces.end()) {
cu.getForce().copyTo(*savedForces[lastForceGroups]); // The forces are still valid. We just need a different force group right now. Save the old
validSavedForces.insert(lastForceGroups); // forces in case we need them again.
cu.getForce().copyTo(*savedForces[lastForceGroups]);
validSavedForces.insert(lastForceGroups);
}
} }
else else
validSavedForces.clear(); validSavedForces.clear();
...@@ -7582,15 +7784,16 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7582,15 +7784,16 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
bool computeForce = (needsForces[step] || computeBothForceAndEnergy[step]); bool computeForce = (needsForces[step] || computeBothForceAndEnergy[step]);
bool computeEnergy = (needsEnergy[step] || computeBothForceAndEnergy[step]); bool computeEnergy = (needsEnergy[step] || computeBothForceAndEnergy[step]);
if (!computeEnergy && validSavedForces.find(forceGroupFlags[step]) != validSavedForces.end()) { if (!computeEnergy && validSavedForces.find(forceGroups) != validSavedForces.end()) {
// We can just restore the forces we saved earlier. // We can just restore the forces we saved earlier.
savedForces[forceGroupFlags[step]]->copyTo(cu.getForce()); savedForces[forceGroups]->copyTo(cu.getForce());
context.getLastForceGroups() = forceGroups;
} }
else { else {
recordChangedParameters(context); recordChangedParameters(context);
energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroupFlags[step]); energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroups);
energyFloat = (float) energy; savedEnergy[forceGroups] = energy;
if (needsEnergyParamDerivs) { if (needsEnergyParamDerivs) {
context.getEnergyParameterDerivatives(energyParamDerivs); context.getEnergyParameterDerivatives(energyParamDerivs);
if (perDofEnergyParamDerivNames.size() > 0) { if (perDofEnergyParamDerivNames.size() > 0) {
...@@ -7609,6 +7812,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7609,6 +7812,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
} }
forcesAreValid = true; forcesAreValid = true;
} }
if (needsEnergy[step]) {
energy = savedEnergy[forceGroups];
energyFloat = (float) energy;
}
if (needsGlobals[step] && !deviceGlobalsAreCurrent) { if (needsGlobals[step] && !deviceGlobalsAreCurrent) {
// Upload the global values to the device. // Upload the global values to the device.
...@@ -7620,6 +7827,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7620,6 +7827,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
globalValues->upload(globalValuesFloat); globalValues->upload(globalValuesFloat);
} }
} }
bool stepInvalidatesForces = invalidatesForces[step];
if (stepType[step] == CustomIntegrator::ComputePerDof && !merged[step]) { if (stepType[step] == CustomIntegrator::ComputePerDof && !merged[step]) {
int randomIndex = integration.prepareRandomNumbers(requiredGaussian[step]); int randomIndex = integration.prepareRandomNumbers(requiredGaussian[step]);
kernelArgs[step][0][1] = &posCorrection; kernelArgs[step][0][1] = &posCorrection;
...@@ -7646,7 +7854,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7646,7 +7854,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128); cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], sumWorkGroupSize, sumWorkGroupSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double value; double value;
summedValue->download(&value); summedValue->download(&value);
...@@ -7660,7 +7868,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7660,7 +7868,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
} }
else if (stepType[step] == CustomIntegrator::UpdateContextState) { else if (stepType[step] == CustomIntegrator::UpdateContextState) {
recordChangedParameters(context); recordChangedParameters(context);
context.updateContextState(); stepInvalidatesForces = context.updateContextState();
} }
else if (stepType[step] == CustomIntegrator::ConstrainPositions) { else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
if (hasAnyConstraints) { if (hasAnyConstraints) {
...@@ -7685,8 +7893,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7685,8 +7893,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
if (blockEnd[step] != -1) if (blockEnd[step] != -1)
nextStep = blockEnd[step]; // Return to the start of a while block. nextStep = blockEnd[step]; // Return to the start of a while block.
} }
if (invalidatesForces[step]) if (stepInvalidatesForces) {
forcesAreValid = false; forcesAreValid = false;
savedEnergy.clear();
}
step = nextStep; step = nextStep;
} }
recordChangedParameters(context); recordChangedParameters(context);
...@@ -7746,7 +7956,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, ...@@ -7746,7 +7956,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms()); cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms());
void* args[] = {&sumBuffer->getDevicePointer(), &summedValue->getDevicePointer()}; void* args[] = {&sumBuffer->getDevicePointer(), &summedValue->getDevicePointer()};
cu.executeKernel(sumKineticEnergyKernel, args, CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); cu.executeKernel(sumKineticEnergyKernel, args, sumWorkGroupSize, sumWorkGroupSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double ke; double ke;
summedValue->download(&ke); summedValue->download(&ke);
...@@ -7902,6 +8112,8 @@ CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() { ...@@ -7902,6 +8112,8 @@ CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
cu.setAsCurrent(); cu.setAsCurrent();
if (savedPositions != NULL) if (savedPositions != NULL)
delete savedPositions; delete savedPositions;
if (savedForces != NULL)
delete savedForces;
if (moleculeAtoms != NULL) if (moleculeAtoms != NULL)
delete moleculeAtoms; delete moleculeAtoms;
if (moleculeStartIndex != NULL) if (moleculeStartIndex != NULL)
...@@ -7911,6 +8123,7 @@ CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() { ...@@ -7911,6 +8123,7 @@ CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) { void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
cu.setAsCurrent(); cu.setAsCurrent();
savedPositions = new CudaArray(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "savedPositions"); savedPositions = new CudaArray(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "savedPositions");
savedForces = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms()*3, "savedForces");
CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat); CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
kernel = cu.getKernel(module, "scalePositions"); kernel = cu.getKernel(module, "scalePositions");
} }
...@@ -7948,6 +8161,12 @@ void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, d ...@@ -7948,6 +8161,12 @@ void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, d
m<<"Error saving positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")"; m<<"Error saving positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str()); throw OpenMMException(m.str());
} }
result = cuMemcpyDtoD(savedForces->getDevicePointer(), cu.getForce().getDevicePointer(), savedForces->getSize()*savedForces->getElementSize());
if (result != CUDA_SUCCESS) {
std::stringstream m;
m<<"Error saving forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str());
}
float scalefX = (float) scaleX; float scalefX = (float) scaleX;
float scalefY = (float) scaleY; float scalefY = (float) scaleY;
float scalefZ = (float) scaleZ; float scalefZ = (float) scaleZ;
...@@ -7969,6 +8188,12 @@ void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) ...@@ -7969,6 +8188,12 @@ void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context)
m<<"Error restoring positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")"; m<<"Error restoring positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str()); throw OpenMMException(m.str());
} }
result = cuMemcpyDtoD(cu.getForce().getDevicePointer(), savedForces->getDevicePointer(), savedForces->getSize()*savedForces->getElementSize());
if (result != CUDA_SUCCESS) {
std::stringstream m;
m<<"Error restoring forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str());
}
} }
CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() { CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. * * Portions copyright (c) 2008-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() { ...@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory); registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory); registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory); registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory); registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
...@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string ...@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
if (threadsEnv != NULL) if (threadsEnv != NULL)
stringstream(threadsEnv) >> threads; stringstream(threadsEnv) >> threads;
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads)); hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
}
void CudaPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
Platform& platform = originalContext.getPlatform();
string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeviceIndex());
string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseBlockingSync());
string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaPrecision());
string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseCpuPme());
string compilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaCompiler());
string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaTempDirectory());
string hostCompilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaHostCompiler());
string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDisablePmeStream());
string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeterministicForces());
int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
} }
void CudaPlatform::contextDestroyed(ContextImpl& context) const { void CudaPlatform::contextDestroyed(ContextImpl& context) const {
...@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty, CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
const string& deterministicForcesProperty, int numThreads) : const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) { context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
bool blocking = (blockingProperty == "true"); bool blocking = (blockingProperty == "true");
vector<string> devices; vector<string> devices;
...@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys ...@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
searchPos = nextPos+1; searchPos = nextPos+1;
} }
devices.push_back(deviceIndexProperty.substr(searchPos)); devices.push_back(deviceIndexProperty.substr(searchPos));
PlatformData* originalData = NULL;
if (originalContext != NULL)
originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
try { try {
for (int i = 0; i < (int) devices.size(); i++) { for (int i = 0; i < (int) devices.size(); i++) {
if (devices[i].length() > 0) { if (devices[i].length() > 0) {
int deviceIndex; int deviceIndex;
stringstream(devices[i]) >> deviceIndex; stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this)); contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
} }
} }
if (contexts.size() == 0) if (contexts.size() == 0)
contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this)); contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
} }
catch (...) { catch (...) {
// If an exception was thrown, do our best to clean up memory. // If an exception was thrown, do our best to clean up memory.
......
/**
* Copy the positions and velocities to the inner context.
*/
extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4* velm, int* __restrict__ atomOrder,
real4* innerPosq, real4* innerPosqCorrection, mixed4* innerVelm, int* __restrict__ innerInvAtomOrder,
int numAtoms) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
int index = innerInvAtomOrder[atomOrder[i]];
innerPosq[index] = posq[i];
innerVelm[index] = velm[i];
#ifdef USE_MIXED_PRECISION
innerPosqCorrection[index] = posqCorrection[i];
#endif
}
}
/**
* Copy the forces back to the main context.
*/
extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAtomOrder, long long* innerForces,
int* __restrict__ innerAtomOrder, int numAtoms, int paddedNumAtoms) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
int index = invAtomOrder[innerAtomOrder[i]];
forces[index] = innerForces[i];
forces[index+paddedNumAtoms] = innerForces[i+paddedNumAtoms];
forces[index+paddedNumAtoms*2] = innerForces[i+paddedNumAtoms*2];
}
}
/**
* Add all the forces from the CVs.
*/
extern "C" __global__ void addForces(long long* forces, int bufferSize
PARAMETER_ARGUMENTS) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x) {
ADD_FORCES
}
}
...@@ -6,26 +6,17 @@ inline __device__ real3 trim(real4 v) { ...@@ -6,26 +6,17 @@ inline __device__ real3 trim(real4 v) {
} }
/** /**
* This does nothing, and just exists to simply the code generation. * This does nothing, and just exists to simplify the code generation.
*/ */
inline __device__ real3 trim(real3 v) { inline __device__ real3 trim(real3 v) {
return v; return v;
} }
/** /**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude. * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
*/
inline __device__ real4 delta(real4 vec1, real4 vec2) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
/**
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude. * and setting the fourth component to the squared magnitude.
*/ */
inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { inline __device__ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f); real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result) APPLY_PERIODIC_TO_DELTA(result)
...@@ -95,6 +86,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -95,6 +86,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) { for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
// Load the next block of acceptors into local memory. // Load the next block of acceptors into local memory.
__syncthreads();
int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart); int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
if (threadIdx.x < blockSize) { if (threadIdx.x < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x]; int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
...@@ -105,8 +97,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -105,8 +97,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
__syncthreads(); __syncthreads();
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
for (int index = 0; index < blockSize; index++) { for (int index = 0; index < blockSize; index++) {
#ifdef USE_EXCLUSIONS
int acceptorIndex = acceptorStart+index; int acceptorIndex = acceptorStart+index;
#ifdef USE_EXCLUSIONS
if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w) if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
continue; continue;
#endif #endif
...@@ -115,7 +107,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -115,7 +107,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
real4 a1 = posBuffer[3*index]; real4 a1 = posBuffer[3*index];
real4 a2 = posBuffer[3*index+1]; real4 a2 = posBuffer[3*index+1];
real4 a3 = posBuffer[3*index+2]; real4 a3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
...@@ -183,6 +175,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -183,6 +175,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) { for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
// Load the next block of donors into local memory. // Load the next block of donors into local memory.
__syncthreads();
int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart); int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
if (threadIdx.x < blockSize) { if (threadIdx.x < blockSize) {
int4 atoms2 = donorAtoms[donorStart+threadIdx.x]; int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
...@@ -193,8 +186,8 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -193,8 +186,8 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
__syncthreads(); __syncthreads();
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
for (int index = 0; index < blockSize; index++) { for (int index = 0; index < blockSize; index++) {
#ifdef USE_EXCLUSIONS
int donorIndex = donorStart+index; int donorIndex = donorStart+index;
#ifdef USE_EXCLUSIONS
if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w) if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
continue; continue;
#endif #endif
...@@ -203,7 +196,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -203,7 +196,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
real4 d1 = posBuffer[3*index]; real4 d1 = posBuffer[3*index];
real4 d2 = posBuffer[3*index+1]; real4 d2 = posBuffer[3*index+1];
real4 d3 = posBuffer[3*index+2]; real4 d3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
......
...@@ -60,6 +60,11 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -60,6 +60,11 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
* Determine whether a particular interaction is in the list of exclusions. * Determine whether a particular interaction is in the list of exclusions.
*/ */
inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) { inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
if (atom1 > atom2) {
int temp = atom1;
atom1 = atom2;
atom2 = temp;
}
int first = exclusionStartIndex[atom1]; int first = exclusionStartIndex[atom1];
int last = exclusionStartIndex[atom1+1]; int last = exclusionStartIndex[atom1+1];
for (int i = last-1; i >= first; i--) { for (int i = last-1; i >= first; i--) {
......
...@@ -680,7 +680,9 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt ...@@ -680,7 +680,9 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights, extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights, const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights, const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) { const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
const int* __restrict__ localCoordsStartIndex) {
// Two particle average sites. // Two particle average sites.
...@@ -732,30 +734,31 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -732,30 +734,31 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Local coordinates sites. // Local coordinates sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
int4 atoms = localCoordsAtoms[index]; int siteAtomIndex = localCoordsIndex[index];
const real* params = &localCoordsParams[12*index]; int start = localCoordsStartIndex[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); int end = localCoordsStartIndex[index+1];
mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y); mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z); for (int j = start; j < end; j++) {
mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w); mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z); origin += pos*localCoordsWeights[3*j];
mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z); xdir += pos*localCoordsWeights[3*j+1];
mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z); ydir += pos*localCoordsWeights[3*j+2];
mixed3 originWeights = make_mixed3(params[0], params[1], params[2]); }
mixed3 xWeights = make_mixed3(params[3], params[4], params[5]);
mixed3 yWeights = make_mixed3(params[6], params[7], params[8]);
mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
mixed3 xdir = pos1*xWeights.x + pos2*xWeights.y + pos3*xWeights.z;
mixed3 ydir = pos1*yWeights.x + pos2*yWeights.y + pos3*yWeights.z;
mixed3 zdir = cross(xdir, ydir); mixed3 zdir = cross(xdir, ydir);
xdir *= rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z); mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
zdir *= rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z); mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
xdir *= invNormXdir;
zdir *= invNormZdir;
ydir = cross(zdir, xdir); ydir = cross(zdir, xdir);
real4 localPosition_4 = localCoordsPos[index];
mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z; pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z; pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z; pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
storePos(posq, posqCorrection, atoms.x, pos); storePos(posq, posqCorrection, siteAtomIndex, pos);
} }
} }
...@@ -778,7 +781,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -778,7 +781,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights, const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights, const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) { const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
const int* __restrict__ localCoordsStartIndex) {
// Two particle average sites. // Two particle average sites.
...@@ -826,87 +831,56 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -826,87 +831,56 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Local coordinates sites. // Local coordinates sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
int4 atoms = localCoordsAtoms[index]; int siteAtomIndex = localCoordsIndex[index];
const real* params = &localCoordsParams[12*index]; int start = localCoordsStartIndex[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); int end = localCoordsStartIndex[index+1];
mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y); mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z); for (int j = start; j < end; j++) {
mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w); mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z); origin += pos*localCoordsWeights[3*j];
mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z); xdir += pos*localCoordsWeights[3*j+1];
mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z); ydir += pos*localCoordsWeights[3*j+2];
mixed3 originWeights = make_mixed3(params[0], params[1], params[2]); }
mixed3 wx = make_mixed3(params[3], params[4], params[5]);
mixed3 wy = make_mixed3(params[6], params[7], params[8]);
mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
mixed3 xdir = pos1*wx.x + pos2*wx.y + pos3*wx.z;
mixed3 ydir = pos1*wy.x + pos2*wy.y + pos3*wy.z;
mixed3 zdir = cross(xdir, ydir); mixed3 zdir = cross(xdir, ydir);
mixed invNormXdir = rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z); mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
mixed invNormZdir = rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z); mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
mixed3 dx = xdir*invNormXdir; mixed3 dx = xdir*invNormXdir;
mixed3 dz = zdir*invNormZdir; mixed3 dz = zdir*invNormZdir;
mixed3 dy = cross(dz, dx); mixed3 dy = cross(dz, dx);
real4 localPosition_4 = localCoordsPos[index];
mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
// The derivatives for this case are very complicated. They were computed with SymPy then simplified by hand. // The derivatives for this case are very complicated. They were computed with SymPy then simplified by hand.
mixed t11 = (wx.x*ydir.x-wy.x*xdir.x)*invNormZdir; real3 f = loadForce(siteAtomIndex, force);
mixed t12 = (wx.x*ydir.y-wy.x*xdir.y)*invNormZdir;
mixed t13 = (wx.x*ydir.z-wy.x*xdir.z)*invNormZdir;
mixed t21 = (wx.y*ydir.x-wy.y*xdir.x)*invNormZdir;
mixed t22 = (wx.y*ydir.y-wy.y*xdir.y)*invNormZdir;
mixed t23 = (wx.y*ydir.z-wy.y*xdir.z)*invNormZdir;
mixed t31 = (wx.z*ydir.x-wy.z*xdir.x)*invNormZdir;
mixed t32 = (wx.z*ydir.y-wy.z*xdir.y)*invNormZdir;
mixed t33 = (wx.z*ydir.z-wy.z*xdir.z)*invNormZdir;
mixed sx1 = t13*dz.y-t12*dz.z;
mixed sy1 = t11*dz.z-t13*dz.x;
mixed sz1 = t12*dz.x-t11*dz.y;
mixed sx2 = t23*dz.y-t22*dz.z;
mixed sy2 = t21*dz.z-t23*dz.x;
mixed sz2 = t22*dz.x-t21*dz.y;
mixed sx3 = t33*dz.y-t32*dz.z;
mixed sy3 = t31*dz.z-t33*dz.x;
mixed sz3 = t32*dz.x-t31*dz.y;
mixed3 wxScaled = wx*invNormXdir;
real3 f = loadForce(atoms.x, force);
mixed3 fp1 = localPosition*f.x; mixed3 fp1 = localPosition*f.x;
mixed3 fp2 = localPosition*f.y; mixed3 fp2 = localPosition*f.y;
mixed3 fp3 = localPosition*f.z; mixed3 fp3 = localPosition*f.z;
real3 f1 = make_real3(0); for (int j = start; j < end; j++) {
real3 f2 = make_real3(0); real originWeight = localCoordsWeights[3*j];
real3 f3 = make_real3(0); real wx = localCoordsWeights[3*j+1];
f1.x += fp1.x*wxScaled.x*(1-dx.x*dx.x) + fp1.z*(dz.x*sx1 ) + fp1.y*((-dx.x*dy.x )*wxScaled.x + dy.x*sx1 - dx.y*t12 - dx.z*t13) + f.x*originWeights.x; real wy = localCoordsWeights[3*j+2];
f1.y += fp1.x*wxScaled.x*( -dx.x*dx.y) + fp1.z*(dz.x*sy1+t13) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.x + dy.x*sy1 + dx.y*t11); mixed wxScaled = wx*invNormXdir;
f1.z += fp1.x*wxScaled.x*( -dx.x*dx.z) + fp1.z*(dz.x*sz1-t12) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.x + dy.x*sz1 + dx.z*t11); mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
f2.x += fp1.x*wxScaled.y*(1-dx.x*dx.x) + fp1.z*(dz.x*sx2 ) + fp1.y*((-dx.x*dy.x )*wxScaled.y + dy.x*sx2 - dx.y*t22 - dx.z*t23) + f.x*originWeights.y; mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
f2.y += fp1.x*wxScaled.y*( -dx.x*dx.y) + fp1.z*(dz.x*sy2+t23) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.y + dy.x*sy2 + dx.y*t21); mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
f2.z += fp1.x*wxScaled.y*( -dx.x*dx.z) + fp1.z*(dz.x*sz2-t22) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.y + dy.x*sz2 + dx.z*t21); mixed sx = t3*dz.y-t2*dz.z;
f3.x += fp1.x*wxScaled.z*(1-dx.x*dx.x) + fp1.z*(dz.x*sx3 ) + fp1.y*((-dx.x*dy.x )*wxScaled.z + dy.x*sx3 - dx.y*t32 - dx.z*t33) + f.x*originWeights.z; mixed sy = t1*dz.z-t3*dz.x;
f3.y += fp1.x*wxScaled.z*( -dx.x*dx.y) + fp1.z*(dz.x*sy3+t33) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.z + dy.x*sy3 + dx.y*t31); mixed sz = t2*dz.x-t1*dz.y;
f3.z += fp1.x*wxScaled.z*( -dx.x*dx.z) + fp1.z*(dz.x*sz3-t32) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.z + dy.x*sz3 + dx.z*t31); real3 fresult = make_real3(0);
f1.x += fp2.x*wxScaled.x*( -dx.y*dx.x) + fp2.z*(dz.y*sx1-t13) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.x - dy.y*sx1 - dx.x*t12); fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx ) + fp1.y*((-dx.x*dy.x )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
f1.y += fp2.x*wxScaled.x*(1-dx.y*dx.y) + fp2.z*(dz.y*sy1 ) - fp2.y*(( dx.y*dy.y )*wxScaled.x - dy.y*sy1 + dx.x*t11 + dx.z*t13) + f.y*originWeights.x; fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
f1.z += fp2.x*wxScaled.x*( -dx.y*dx.z) + fp2.z*(dz.y*sz1+t11) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.x - dy.y*sz1 - dx.z*t12); fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
f2.x += fp2.x*wxScaled.y*( -dx.y*dx.x) + fp2.z*(dz.y*sx2-t23) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.y - dy.y*sx2 - dx.x*t22); fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
f2.y += fp2.x*wxScaled.y*(1-dx.y*dx.y) + fp2.z*(dz.y*sy2 ) - fp2.y*(( dx.y*dy.y )*wxScaled.y - dy.y*sy2 + dx.x*t21 + dx.z*t23) + f.y*originWeights.y; fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy ) - fp2.y*(( dx.y*dy.y )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
f2.z += fp2.x*wxScaled.y*( -dx.y*dx.z) + fp2.z*(dz.y*sz2+t21) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.y - dy.y*sz2 - dx.z*t22); fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
f3.x += fp2.x*wxScaled.z*( -dx.y*dx.x) + fp2.z*(dz.y*sx3-t33) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.z - dy.y*sx3 - dx.x*t32); fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
f3.y += fp2.x*wxScaled.z*(1-dx.y*dx.y) + fp2.z*(dz.y*sy3 ) - fp2.y*(( dx.y*dy.y )*wxScaled.z - dy.y*sy3 + dx.x*t31 + dx.z*t33) + f.y*originWeights.z; fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
f3.z += fp2.x*wxScaled.z*( -dx.y*dx.z) + fp2.z*(dz.y*sz3+t31) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.z - dy.y*sz3 - dx.z*t32); fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz ) + fp3.y*((-dx.z*dy.z )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
f1.x += fp3.x*wxScaled.x*( -dx.z*dx.x) + fp3.z*(dz.z*sx1+t12) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.x + dy.z*sx1 + dx.x*t13); addForce(localCoordsAtoms[j], force, fresult);
f1.y += fp3.x*wxScaled.x*( -dx.z*dx.y) + fp3.z*(dz.z*sy1-t11) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.x + dy.z*sy1 + dx.y*t13); }
f1.z += fp3.x*wxScaled.x*(1-dx.z*dx.z) + fp3.z*(dz.z*sz1 ) + fp3.y*((-dx.z*dy.z )*wxScaled.x + dy.z*sz1 - dx.x*t11 - dx.y*t12) + f.z*originWeights.x;
f2.x += fp3.x*wxScaled.y*( -dx.z*dx.x) + fp3.z*(dz.z*sx2+t22) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.y + dy.z*sx2 + dx.x*t23);
f2.y += fp3.x*wxScaled.y*( -dx.z*dx.y) + fp3.z*(dz.z*sy2-t21) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.y + dy.z*sy2 + dx.y*t23);
f2.z += fp3.x*wxScaled.y*(1-dx.z*dx.z) + fp3.z*(dz.z*sz2 ) + fp3.y*((-dx.z*dy.z )*wxScaled.y + dy.z*sz2 - dx.x*t21 - dx.y*t22) + f.z*originWeights.y;
f3.x += fp3.x*wxScaled.z*( -dx.z*dx.x) + fp3.z*(dz.z*sx3+t32) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.z + dy.z*sx3 + dx.x*t33);
f3.y += fp3.x*wxScaled.z*( -dx.z*dx.y) + fp3.z*(dz.z*sy3-t31) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.z + dy.z*sy3 + dx.y*t33);
f3.z += fp3.x*wxScaled.z*(1-dx.z*dx.z) + fp3.z*(dz.z*sz3 ) + fp3.y*((-dx.z*dy.z )*wxScaled.z + dy.z*sz3 - dx.x*t31 - dx.y*t32) + f.z*originWeights.z;
addForce(atoms.y, force, f1);
addForce(atoms.z, force, f2);
addForce(atoms.w, force, f3);
} }
} }
...@@ -924,4 +898,4 @@ extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const ...@@ -924,4 +898,4 @@ extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const
velm[index] = velocity; velm[index] = velocity;
} }
} }
} }
\ No newline at end of file
...@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res ...@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
clearSingleBuffer(buffer6, size6); clearSingleBuffer(buffer6, size6);
} }
/**
* Sum the energy buffer.
*/
__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
extern __shared__ mixed tempBuffer[];
const unsigned int thread = threadIdx.x;
mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += blockDim.x)
sum += energyBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) {
__syncthreads();
if (thread%(i*2) == 0 && thread+i < workGroupSize)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
/** /**
* Record the atomic charges into the posq array. * Record the atomic charges into the posq array.
*/ */
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2017 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestCustomCVForce.h"
void runPlatformTests() {
}
...@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) { ...@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
system.addParticle(0.0); system.addParticle(0.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false", CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()), platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1); platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
......
...@@ -56,7 +56,7 @@ void testGaussian() { ...@@ -56,7 +56,7 @@ void testGaussian() {
system.addParticle(1.0); system.addParticle(1.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false", CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()), platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1); platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
context.getIntegrationUtilities().initRandomNumberGenerator(0); context.getIntegrationUtilities().initRandomNumberGenerator(0);
......
...@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) { ...@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
system.addParticle(0.0); system.addParticle(0.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false", CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()), platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1); platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
CudaArray data(context, array.size(), 4, "sortData"); CudaArray data(context, array.size(), 4, "sortData");
......
...@@ -163,7 +163,8 @@ public: ...@@ -163,7 +163,8 @@ public:
class ForcePostComputation; class ForcePostComputation;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData); OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData,
OpenCLContext* originalContext);
~OpenCLContext(); ~OpenCLContext();
/** /**
* This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
...@@ -363,9 +364,13 @@ public: ...@@ -363,9 +364,13 @@ public:
*/ */
void reduceBuffer(OpenCLArray& array, int numBuffers); void reduceBuffer(OpenCLArray& array, int numBuffers);
/** /**
* Sum the buffesr containing forces. * Sum the buffers containing forces.
*/ */
void reduceForces(); void reduceForces();
/**
* Sum the buffer containing energy.
*/
double reduceEnergy();
/** /**
* Get the current simulation time. * Get the current simulation time.
*/ */
...@@ -749,6 +754,7 @@ private: ...@@ -749,6 +754,7 @@ private:
cl::Kernel clearSixBuffersKernel; cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceReal4Kernel; cl::Kernel reduceReal4Kernel;
cl::Kernel reduceForcesKernel; cl::Kernel reduceForcesKernel;
cl::Kernel reduceEnergyKernel;
cl::Kernel setChargesKernel; cl::Kernel setChargesKernel;
std::vector<OpenCLForceInfo*> forces; std::vector<OpenCLForceInfo*> forces;
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
...@@ -763,6 +769,7 @@ private: ...@@ -763,6 +769,7 @@ private:
OpenCLArray* forceBuffers; OpenCLArray* forceBuffers;
OpenCLArray* longForceBuffer; OpenCLArray* longForceBuffer;
OpenCLArray* energyBuffer; OpenCLArray* energyBuffer;
OpenCLArray* energySum;
OpenCLArray* energyParamDerivBuffer; OpenCLArray* energyParamDerivBuffer;
OpenCLArray* atomIndexDevice; OpenCLArray* atomIndexDevice;
OpenCLArray* chargeBuffer; OpenCLArray* chargeBuffer;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2014 Stanford University and the Authors. * * Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -156,8 +156,11 @@ private: ...@@ -156,8 +156,11 @@ private:
OpenCLArray* vsite3AvgWeights; OpenCLArray* vsite3AvgWeights;
OpenCLArray* vsiteOutOfPlaneAtoms; OpenCLArray* vsiteOutOfPlaneAtoms;
OpenCLArray* vsiteOutOfPlaneWeights; OpenCLArray* vsiteOutOfPlaneWeights;
OpenCLArray* vsiteLocalCoordsIndex;
OpenCLArray* vsiteLocalCoordsAtoms; OpenCLArray* vsiteLocalCoordsAtoms;
OpenCLArray* vsiteLocalCoordsParams; OpenCLArray* vsiteLocalCoordsWeights;
OpenCLArray* vsiteLocalCoordsPos;
OpenCLArray* vsiteLocalCoordsStartIndex;
int randomPos; int randomPos;
int lastSeed, numVsites; int lastSeed, numVsites;
bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites; bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include "openmm/internal/CompiledExpressionSet.h" #include "openmm/internal/CompiledExpressionSet.h"
#include "openmm/internal/CustomIntegratorUtilities.h" #include "openmm/internal/CustomIntegratorUtilities.h"
#include "lepton/CompiledExpression.h" #include "lepton/CompiledExpression.h"
#include "lepton/ExpressionProgram.h"
#include "openmm/System.h" #include "openmm/System.h"
namespace OpenMM { namespace OpenMM {
...@@ -1207,6 +1208,54 @@ private: ...@@ -1207,6 +1208,54 @@ private:
cl::Kernel framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel; cl::Kernel framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
}; };
/**
* This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
*/
class OpenCLCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
public:
OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcCustomCVForceKernel(name, platform),
cl(cl), hasInitializedKernels(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
}
~OpenCLCalcCustomCVForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomCVForce this kernel will be used for
* @param innerContext the context created by the CustomCVForce for computing collective variables
*/
void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param innerContext the context created by the CustomCVForce for computing collective variables
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
/**
* Copy state information to the inner context.
*
* @param context the context in which to execute this kernel
* @param innerContext the context created by the CustomCVForce for computing collective variables
*/
void copyState(ContextImpl& context, ContextImpl& innerContext);
private:
class ReorderListener;
OpenCLContext& cl;
bool hasInitializedKernels;
Lepton::ExpressionProgram energyExpression;
std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
std::vector<OpenCLArray*> cvForces;
OpenCLArray* invAtomOrder;
OpenCLArray* innerInvAtomOrder;
cl::Kernel copyStateKernel, copyForcesKernel, addForcesKernel;
};
/** /**
* This kernel is invoked by VerletIntegrator to take one time step. * This kernel is invoked by VerletIntegrator to take one time step.
*/ */
...@@ -1472,7 +1521,9 @@ private: ...@@ -1472,7 +1521,9 @@ private:
class ReorderListener; class ReorderListener;
class GlobalTarget; class GlobalTarget;
class DerivFunction; class DerivFunction;
std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName); std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
std::vector<std::pair<std::string, std::string> >& functionNames);
void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid); void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context); Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes); void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
...@@ -1482,7 +1533,7 @@ private: ...@@ -1482,7 +1533,7 @@ private:
OpenCLContext& cl; OpenCLContext& cl;
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
OpenCLArray* globalValues; OpenCLArray* globalValues;
...@@ -1491,6 +1542,8 @@ private: ...@@ -1491,6 +1542,8 @@ private:
OpenCLArray* uniformRandoms; OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed; OpenCLArray* randomSeed;
OpenCLArray* perDofEnergyParamDerivs; OpenCLArray* perDofEnergyParamDerivs;
std::vector<OpenCLArray*> tabulatedFunctions;
std::map<int, double> savedEnergy;
std::map<int, OpenCLArray*> savedForces; std::map<int, OpenCLArray*> savedForces;
std::set<int> validSavedForces; std::set<int> validSavedForces;
OpenCLParameterSet* perDofValues; OpenCLParameterSet* perDofValues;
...@@ -1573,7 +1626,7 @@ private: ...@@ -1573,7 +1626,7 @@ private:
class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel { class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
public: public:
OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl), OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) { hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
} }
~OpenCLApplyMonteCarloBarostatKernel(); ~OpenCLApplyMonteCarloBarostatKernel();
/** /**
...@@ -1608,6 +1661,7 @@ private: ...@@ -1608,6 +1661,7 @@ private:
bool hasInitializedKernels; bool hasInitializedKernels;
int numMolecules; int numMolecules;
OpenCLArray* savedPositions; OpenCLArray* savedPositions;
OpenCLArray* savedForces;
OpenCLArray* moleculeAtoms; OpenCLArray* moleculeAtoms;
OpenCLArray* moleculeStartIndex; OpenCLArray* moleculeStartIndex;
cl::Kernel kernel; cl::Kernel kernel;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment