Commit 18295108 authored by peastman's avatar peastman
Browse files

Merge changes from main branch

parents e6101f68 8d7234e5
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2014 Stanford University and the Authors. * * Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -158,8 +158,11 @@ private: ...@@ -158,8 +158,11 @@ private:
CudaArray* vsite3AvgWeights; CudaArray* vsite3AvgWeights;
CudaArray* vsiteOutOfPlaneAtoms; CudaArray* vsiteOutOfPlaneAtoms;
CudaArray* vsiteOutOfPlaneWeights; CudaArray* vsiteOutOfPlaneWeights;
CudaArray* vsiteLocalCoordsIndex;
CudaArray* vsiteLocalCoordsAtoms; CudaArray* vsiteLocalCoordsAtoms;
CudaArray* vsiteLocalCoordsParams; CudaArray* vsiteLocalCoordsWeights;
CudaArray* vsiteLocalCoordsPos;
CudaArray* vsiteLocalCoordsStartIndex;
int randomPos; int randomPos;
int lastSeed, numVsites; int lastSeed, numVsites;
double2 lastStepSize; double2 lastStepSize;
......
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
#include "openmm/internal/CompiledExpressionSet.h" #include "openmm/internal/CompiledExpressionSet.h"
#include "openmm/internal/CustomIntegratorUtilities.h" #include "openmm/internal/CustomIntegratorUtilities.h"
#include "lepton/CompiledExpression.h" #include "lepton/CompiledExpression.h"
#include "lepton/ExpressionProgram.h"
#include <cufft.h> #include <cufft.h>
namespace OpenMM { namespace OpenMM {
...@@ -1229,6 +1230,54 @@ private: ...@@ -1229,6 +1230,54 @@ private:
CUevent event; CUevent event;
}; };
/**
 * CUDA implementation of the kernel that CustomCVForce invokes to compute the
 * forces acting on the system and the energy of the system.
 */
class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
public:
    /**
     * Create the kernel. Only bookkeeping members are set here: the listener flag
     * starts out false and both atom order arrays start out NULL.
     *
     * @param name      the name passed on to the CalcCustomCVForceKernel base class
     * @param platform  the Platform passed on to the CalcCustomCVForceKernel base class
     * @param cu        the CudaContext this kernel keeps a reference to
     */
    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu)
            : CalcCustomCVForceKernel(name, platform),
              cu(cu),
              hasInitializedListeners(false),
              invAtomOrder(NULL),
              innerInvAtomOrder(NULL) {
    }

    ~CudaCalcCustomCVForceKernel();

    /**
     * Initialize the kernel.
     *
     * @param system        the System this kernel will be applied to
     * @param force         the CustomCVForce this kernel will be used for
     * @param innerContext  the context created by the CustomCVForce for computing collective variables
     */
    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);

    /**
     * Execute the kernel to calculate the forces and/or energy.
     *
     * @param context        the context in which to execute this kernel
     * @param innerContext   the context created by the CustomCVForce for computing collective variables
     * @param includeForces  true if forces should be calculated
     * @param includeEnergy  true if the energy should be calculated
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);

    /**
     * Copy state information to the inner context.
     *
     * @param context       the context in which to execute this kernel
     * @param innerContext  the context created by the CustomCVForce for computing collective variables
     */
    void copyState(ContextImpl& context, ContextImpl& innerContext);

private:
    // Defined in the implementation file.
    // NOTE(review): presumably reacts to atom reordering - confirm there.
    class ReorderListener;

    // NOTE: members are declared in the same order as before so that
    // construction order (and the constructor's initializer list) is unaffected.
    CudaContext& cu;
    bool hasInitializedListeners;

    // Expression programs and the names that go with them.
    // NOTE(review): the roles below are read off the member names only - verify
    // against initialize()/execute() in the implementation file.
    Lepton::ExpressionProgram energyExpression;
    std::vector<std::string> variableNames;
    std::vector<std::string> paramDerivNames;
    std::vector<std::string> globalParameterNames;
    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;

    // Device arrays. The two atom order arrays are NULL until something assigns them.
    std::vector<CudaArray*> cvForces;
    CudaArray* invAtomOrder;
    CudaArray* innerInvAtomOrder;

    // CUDA function handles; not set by the constructor.
    CUfunction copyStateKernel;
    CUfunction copyForcesKernel;
    CUfunction addForcesKernel;
};
/** /**
* This kernel is invoked by VerletIntegrator to take one time step. * This kernel is invoked by VerletIntegrator to take one time step.
*/ */
...@@ -1485,7 +1534,9 @@ private: ...@@ -1485,7 +1534,9 @@ private:
class ReorderListener; class ReorderListener;
class GlobalTarget; class GlobalTarget;
class DerivFunction; class DerivFunction;
std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName); std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
std::vector<std::pair<std::string, std::string> >& functionNames);
void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid); void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context); Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes); void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
...@@ -1495,7 +1546,7 @@ private: ...@@ -1495,7 +1546,7 @@ private:
CudaContext& cu; CudaContext& cu;
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
CudaArray* globalValues; CudaArray* globalValues;
...@@ -1504,6 +1555,8 @@ private: ...@@ -1504,6 +1555,8 @@ private:
CudaArray* uniformRandoms; CudaArray* uniformRandoms;
CudaArray* randomSeed; CudaArray* randomSeed;
CudaArray* perDofEnergyParamDerivs; CudaArray* perDofEnergyParamDerivs;
std::vector<CudaArray*> tabulatedFunctions;
std::map<int, double> savedEnergy;
std::map<int, CudaArray*> savedForces; std::map<int, CudaArray*> savedForces;
std::set<int> validSavedForces; std::set<int> validSavedForces;
CudaParameterSet* perDofValues; CudaParameterSet* perDofValues;
...@@ -1587,7 +1640,7 @@ private: ...@@ -1587,7 +1640,7 @@ private:
class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel { class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
public: public:
CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu), CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu),
hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) { hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
} }
~CudaApplyMonteCarloBarostatKernel(); ~CudaApplyMonteCarloBarostatKernel();
/** /**
...@@ -1622,6 +1675,7 @@ private: ...@@ -1622,6 +1675,7 @@ private:
bool hasInitializedKernels; bool hasInitializedKernels;
int numMolecules; int numMolecules;
CudaArray* savedPositions; CudaArray* savedPositions;
CudaArray* savedForces;
CudaArray* moleculeAtoms; CudaArray* moleculeAtoms;
CudaArray* moleculeStartIndex; CudaArray* moleculeStartIndex;
CUfunction kernel; CUfunction kernel;
......
...@@ -53,6 +53,7 @@ public: ...@@ -53,6 +53,7 @@ public:
const std::string& getPropertyValue(const Context& context, const std::string& property) const; const std::string& getPropertyValue(const Context& context, const std::string& property) const;
void setPropertyValue(Context& context, const std::string& property, const std::string& value) const; void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const; void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
void contextDestroyed(ContextImpl& context) const; void contextDestroyed(ContextImpl& context) const;
/** /**
* This is the name of the parameter for selecting which CUDA device or devices to use. * This is the name of the parameter for selecting which CUDA device or devices to use.
...@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData { ...@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
public: public:
PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty, PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty, const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads); const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext);
~PlatformData(); ~PlatformData();
void initializeContexts(const System& system); void initializeContexts(const System& system);
void syncContexts(); void syncContexts();
......
...@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) { ...@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) {
#endif #endif
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0), const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL), pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
// Determine what compiler to use. // Determine what compiler to use.
...@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
cacheDir = cacheDir+"/"; cacheDir = cacheDir+"/";
#endif #endif
contextIndex = platformData.contexts.size(); contextIndex = platformData.contexts.size();
int numDevices;
string errorMessage = "Error initializing Context"; string errorMessage = "Error initializing Context";
CHECK_RESULT(cuDeviceGetCount(&numDevices)); if (originalContext == NULL) {
if (deviceIndex < -1 || deviceIndex >= numDevices) isLinkedContext = false;
throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex)); int numDevices;
CHECK_RESULT(cuDeviceGetCount(&numDevices));
vector<int> devicePrecedence; if (deviceIndex < -1 || deviceIndex >= numDevices)
if (deviceIndex == -1) { throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
devicePrecedence = getDevicePrecedence();
} else { vector<int> devicePrecedence;
devicePrecedence.push_back(deviceIndex); if (deviceIndex == -1) {
} devicePrecedence = getDevicePrecedence();
} else {
this->deviceIndex = -1; devicePrecedence.push_back(deviceIndex);
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) { }
int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) { this->deviceIndex = -1;
this->deviceIndex = trialDeviceIndex; for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
break; int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
this->deviceIndex = trialDeviceIndex;
break;
}
} }
if (this->deviceIndex == -1)
if (deviceIndex != -1)
throw OpenMMException("The requested CUDA device could not be loaded");
else
throw OpenMMException("No compatible CUDA device is available");
}
else {
isLinkedContext = true;
context = originalContext->context;
this->deviceIndex = originalContext->deviceIndex;
this->device = originalContext->device;
} }
if (this->deviceIndex == -1)
if (deviceIndex != -1)
throw OpenMMException("The requested CUDA device could not be loaded");
else
throw OpenMMException("No compatible CUDA device is available");
int major, minor; int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
...@@ -227,6 +236,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -227,6 +236,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
minor = 3; minor = 3;
} }
} }
if (major == 7) {
// Don't generate Volta-specific code until we've made the changes needed
// to support it properly.
major = 6;
minor = 0;
}
gpuArchitecture = intToString(major)+intToString(minor); gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor; computeCapability = major+0.1*minor;
...@@ -292,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -292,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers"); clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers"); clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers"); clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
setChargesKernel = getKernel(utilities, "setCharges"); setChargesKernel = getKernel(utilities, "setCharges");
// Set defines based on the requested precision. // Set defines based on the requested precision.
...@@ -405,6 +421,8 @@ CudaContext::~CudaContext() { ...@@ -405,6 +421,8 @@ CudaContext::~CudaContext() {
delete force; delete force;
if (energyBuffer != NULL) if (energyBuffer != NULL)
delete energyBuffer; delete energyBuffer;
if (energySum != NULL)
delete energySum;
if (energyParamDerivBuffer != NULL) if (energyParamDerivBuffer != NULL)
delete energyParamDerivBuffer; delete energyParamDerivBuffer;
if (atomIndexDevice != NULL) if (atomIndexDevice != NULL)
...@@ -422,7 +440,7 @@ CudaContext::~CudaContext() { ...@@ -422,7 +440,7 @@ CudaContext::~CudaContext() {
if (thread != NULL) if (thread != NULL)
delete thread; delete thread;
string errorMessage = "Error deleting Context"; string errorMessage = "Error deleting Context";
if (contextIsValid) { if (contextIsValid && !isLinkedContext) {
cuProfilerStop(); cuProfilerStop();
CHECK_RESULT(cuCtxDestroy(context)); CHECK_RESULT(cuCtxDestroy(context));
} }
...@@ -435,16 +453,19 @@ void CudaContext::initialize() { ...@@ -435,16 +453,19 @@ void CudaContext::initialize() {
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else { else {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<float>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
} }
...@@ -864,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() { ...@@ -864,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() {
} }
} }
/**
 * Run the reduceEnergy kernel over energyBuffer, writing its result into
 * energySum, then download energySum to the host and return it.
 *
 * @return the downloaded value of energySum, widened to double if it is stored as float
 */
double CudaContext::reduceEnergy() {
    // These stay plain (non-const) ints because their addresses are placed in
    // the void* kernel argument array below.
    int numEntries = energyBuffer->getSize();
    int blockSize = 512;
    void* kernelArgs[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &numEntries, &blockSize};

    // The thread count and the block size passed to executeKernel are the same
    // value, and the last argument requests one buffer element per thread.
    // NOTE(review): assuming executeKernel takes (threads, blockSize, sharedBytes),
    // this launches a single block - confirm against its declaration.
    executeKernel(reduceEnergyKernel, kernelArgs, blockSize, blockSize, blockSize*energyBuffer->getElementSize());

    // NOTE(review): assumes download() has finished copying before it returns - confirm.
    energySum->download(pinnedBuffer);

    // The value is read back with the element type that matches the precision mode.
    const bool storedAsDouble = getUseDoublePrecision() || getUseMixedPrecision();
    if (!storedAsDouble)
        return *((float*) pinnedBuffer);
    return *((double*) pinnedBuffer);
}
void CudaContext::setCharges(const vector<double>& charges) { void CudaContext::setCharges(const vector<double>& charges) {
if (chargeBuffer == NULL) if (chargeBuffer == NULL)
chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
...@@ -1050,9 +1083,16 @@ void CudaContext::findMoleculeGroups() { ...@@ -1050,9 +1083,16 @@ void CudaContext::findMoleculeGroups() {
for (int i = 0; i < (int) forces.size() && identical; i++) { for (int i = 0; i < (int) forces.size() && identical; i++) {
if (mol.groups[i].size() != mol2.groups[i].size()) if (mol.groups[i].size() != mol2.groups[i].size())
identical = false; identical = false;
for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k])) if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
identical = false; identical = false;
vector<int> p1, p2;
forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
for (int m = 0; m < p1.size(); m++)
if (p1[m] != p2[m]-atomOffset)
identical = false;
}
} }
if (identical) { if (identical) {
moleculeInstances[j].push_back(molIndex); moleculeInstances[j].push_back(molIndex);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. * * Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -103,7 +103,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -103,7 +103,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL), ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL), ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL) {
// Create workspace arrays. // Create workspace arrays.
lastStepSize = make_double2(0.0, 0.0); lastStepSize = make_double2(0.0, 0.0);
...@@ -454,8 +455,11 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -454,8 +455,11 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vector<double4> vsite3AvgWeightVec; vector<double4> vsite3AvgWeightVec;
vector<int4> vsiteOutOfPlaneAtomVec; vector<int4> vsiteOutOfPlaneAtomVec;
vector<double4> vsiteOutOfPlaneWeightVec; vector<double4> vsiteOutOfPlaneWeightVec;
vector<int4> vsiteLocalCoordsAtomVec; vector<int> vsiteLocalCoordsIndexVec;
vector<double> vsiteLocalCoordsParamVec; vector<int> vsiteLocalCoordsAtomVec;
vector<int> vsiteLocalCoordsStartVec;
vector<double> vsiteLocalCoordsWeightVec;
vector<double4> vsiteLocalCoordsPosVec;
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
if (system.isVirtualSite(i)) { if (system.isVirtualSite(i)) {
if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) { if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
...@@ -480,64 +484,72 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -480,64 +484,72 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0)); vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
} }
else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) { else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
// An out of plane site. // A local coordinates site.
const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i)); const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
vsiteLocalCoordsAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2))); int numParticles = site.getNumParticles();
Vec3 origin = site.getOriginWeights(); vector<double> origin, x, y;
Vec3 x = site.getXWeights(); site.getOriginWeights(origin);
Vec3 y = site.getYWeights(); site.getXWeights(x);
site.getYWeights(y);
vsiteLocalCoordsIndexVec.push_back(i);
vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
for (int j = 0; j < numParticles; j++) {
vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
vsiteLocalCoordsWeightVec.push_back(origin[j]);
vsiteLocalCoordsWeightVec.push_back(x[j]);
vsiteLocalCoordsWeightVec.push_back(y[j]);
}
Vec3 pos = site.getLocalPosition(); Vec3 pos = site.getLocalPosition();
vsiteLocalCoordsParamVec.push_back(origin[0]); vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
vsiteLocalCoordsParamVec.push_back(origin[1]);
vsiteLocalCoordsParamVec.push_back(origin[2]);
vsiteLocalCoordsParamVec.push_back(x[0]);
vsiteLocalCoordsParamVec.push_back(x[1]);
vsiteLocalCoordsParamVec.push_back(x[2]);
vsiteLocalCoordsParamVec.push_back(y[0]);
vsiteLocalCoordsParamVec.push_back(y[1]);
vsiteLocalCoordsParamVec.push_back(y[2]);
vsiteLocalCoordsParamVec.push_back(pos[0]);
vsiteLocalCoordsParamVec.push_back(pos[1]);
vsiteLocalCoordsParamVec.push_back(pos[2]);
} }
} }
} }
vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
int num2Avg = vsite2AvgAtomVec.size(); int num2Avg = vsite2AvgAtomVec.size();
int num3Avg = vsite3AvgAtomVec.size(); int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size(); int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
int numLocalCoords = vsiteLocalCoordsAtomVec.size(); int numLocalCoords = vsiteLocalCoordsPosVec.size();
vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms"); vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms"); vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms"); vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteLocalCoordsAtoms = CudaArray::create<int4>(context, max(1, numLocalCoords), "vsiteLocalCoordinatesAtoms"); vsiteLocalCoordsIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
vsiteLocalCoordsAtoms = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
vsiteLocalCoordsStartIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
if (num2Avg > 0) if (num2Avg > 0)
vsite2AvgAtoms->upload(vsite2AvgAtomVec); vsite2AvgAtoms->upload(vsite2AvgAtomVec);
if (num3Avg > 0) if (num3Avg > 0)
vsite3AvgAtoms->upload(vsite3AvgAtomVec); vsite3AvgAtoms->upload(vsite3AvgAtomVec);
if (numOutOfPlane > 0) if (numOutOfPlane > 0)
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec); vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
if (numLocalCoords > 0) if (numLocalCoords > 0) {
vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec);
vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec); vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec);
vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec);
}
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision()) {
vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights"); vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights"); vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsParams = CudaArray::create<double>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams"); vsiteLocalCoordsWeights = CudaArray::create<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = CudaArray::create<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) if (num2Avg > 0)
vsite2AvgWeights->upload(vsite2AvgWeightVec); vsite2AvgWeights->upload(vsite2AvgWeightVec);
if (num3Avg > 0) if (num3Avg > 0)
vsite3AvgWeights->upload(vsite3AvgWeightVec); vsite3AvgWeights->upload(vsite3AvgWeightVec);
if (numOutOfPlane > 0) if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec); vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
if (numLocalCoords > 0) if (numLocalCoords > 0) {
vsiteLocalCoordsParams->upload(vsiteLocalCoordsParamVec); vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec);
vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec);
}
} }
else { else {
vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights"); vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights"); vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsParams = CudaArray::create<float>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams"); vsiteLocalCoordsWeights = CudaArray::create<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = CudaArray::create<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) { if (num2Avg > 0) {
vector<float2> floatWeights(num2Avg); vector<float2> floatWeights(num2Avg);
for (int i = 0; i < num2Avg; i++) for (int i = 0; i < num2Avg; i++)
...@@ -557,10 +569,14 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -557,10 +569,14 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneWeights->upload(floatWeights); vsiteOutOfPlaneWeights->upload(floatWeights);
} }
if (numLocalCoords > 0) { if (numLocalCoords > 0) {
vector<float> floatParams(vsiteLocalCoordsParamVec.size()); vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsParamVec.size(); i++) for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
floatParams[i] = (float) vsiteLocalCoordsParamVec[i]; floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
vsiteLocalCoordsParams->upload(floatParams); vsiteLocalCoordsWeights->upload(floatWeights);
vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
vsiteLocalCoordsPos->upload(floatPos);
} }
} }
...@@ -644,10 +660,16 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() { ...@@ -644,10 +660,16 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete vsiteOutOfPlaneAtoms; delete vsiteOutOfPlaneAtoms;
if (vsiteOutOfPlaneWeights != NULL) if (vsiteOutOfPlaneWeights != NULL)
delete vsiteOutOfPlaneWeights; delete vsiteOutOfPlaneWeights;
if (vsiteLocalCoordsIndex != NULL)
delete vsiteLocalCoordsIndex;
if (vsiteLocalCoordsAtoms != NULL) if (vsiteLocalCoordsAtoms != NULL)
delete vsiteLocalCoordsAtoms; delete vsiteLocalCoordsAtoms;
if (vsiteLocalCoordsParams != NULL) if (vsiteLocalCoordsWeights != NULL)
delete vsiteLocalCoordsParams; delete vsiteLocalCoordsWeights;
if (vsiteLocalCoordsPos != NULL)
delete vsiteLocalCoordsPos;
if (vsiteLocalCoordsStartIndex != NULL)
delete vsiteLocalCoordsStartIndex;
} }
void CudaIntegrationUtilities::setNextStepSize(double size) { void CudaIntegrationUtilities::setNextStepSize(double size) {
...@@ -747,7 +769,9 @@ void CudaIntegrationUtilities::computeVirtualSites() { ...@@ -747,7 +769,9 @@ void CudaIntegrationUtilities::computeVirtualSites() {
void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(), &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
&vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()}; &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
&vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
&vsiteLocalCoordsStartIndex->getDevicePointer()};
context.executeKernel(vsitePositionKernel, args, numVsites); context.executeKernel(vsitePositionKernel, args, numVsites);
} }
} }
...@@ -759,7 +783,9 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() { ...@@ -759,7 +783,9 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
&vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(), &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
&vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()}; &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
&vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
&vsiteLocalCoordsStartIndex->getDevicePointer()};
context.executeKernel(vsiteForceKernel, args, numVsites); context.executeKernel(vsiteForceKernel, args, numVsites);
} }
} }
......
...@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform ...@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name()) if (name == CalcCustomCompoundBondForceKernel::Name())
return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCVForceKernel::Name())
return new CudaCalcCustomCVForceKernel(name, platform, cu);
if (name == CalcCustomManyParticleForceKernel::Name()) if (name == CalcCustomManyParticleForceKernel::Name())
return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGayBerneForceKernel::Name()) if (name == CalcGayBerneForceKernel::Name())
......
This diff is collapsed.
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. * * Portions copyright (c) 2008-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() { ...@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory); registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory); registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory); registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory); registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
...@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string ...@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
if (threadsEnv != NULL) if (threadsEnv != NULL)
stringstream(threadsEnv) >> threads; stringstream(threadsEnv) >> threads;
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads)); hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
}
/**
 * Create the platform data for a context that is linked to an already existing context.
 *
 * Every CUDA platform property is read from the original context (through its owner) rather
 * than from a user supplied property map, and the number of threads is taken from the original
 * context's PlatformData, so the new context is configured with the same values as the original.
 *
 * @param context          the newly created context that needs platform data
 * @param originalContext  the existing context whose settings are reused
 */
void CudaPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
// Look the properties up on the platform of the original context.
Platform& platform = originalContext.getPlatform();
string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeviceIndex());
string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseBlockingSync());
string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaPrecision());
string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseCpuPme());
string compilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaCompiler());
string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaTempDirectory());
string hostCompilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaHostCompiler());
string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDisablePmeStream());
string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeterministicForces());
// The thread count is not a platform property here; it is read from the original context's PlatformData.
int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
// The final argument is the original context (contextCreated() passes NULL in this position);
// the PlatformData constructor uses it to hand the original CudaContext objects to the new ones.
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
} }
void CudaPlatform::contextDestroyed(ContextImpl& context) const { void CudaPlatform::contextDestroyed(ContextImpl& context) const {
...@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty, CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
const string& deterministicForcesProperty, int numThreads) : const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) { context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
bool blocking = (blockingProperty == "true"); bool blocking = (blockingProperty == "true");
vector<string> devices; vector<string> devices;
...@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys ...@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
searchPos = nextPos+1; searchPos = nextPos+1;
} }
devices.push_back(deviceIndexProperty.substr(searchPos)); devices.push_back(deviceIndexProperty.substr(searchPos));
PlatformData* originalData = NULL;
if (originalContext != NULL)
originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
try { try {
for (int i = 0; i < (int) devices.size(); i++) { for (int i = 0; i < (int) devices.size(); i++) {
if (devices[i].length() > 0) { if (devices[i].length() > 0) {
int deviceIndex; int deviceIndex;
stringstream(devices[i]) >> deviceIndex; stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this)); contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
} }
} }
if (contexts.size() == 0) if (contexts.size() == 0)
contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this)); contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
} }
catch (...) { catch (...) {
// If an exception was thrown, do our best to clean up memory. // If an exception was thrown, do our best to clean up memory.
......
/**
 * Transfer the particle state (positions and velocities) from the main context into the
 * inner context.
 *
 * Element i of the source arrays is written to element innerInvAtomOrder[atomOrder[i]] of
 * the destination arrays, so the two contexts may store their atoms in different orders.
 * The position correction array is only copied when USE_MIXED_PRECISION is defined.
 *
 * @param posq                 source positions
 * @param posqCorrection       source position corrections (read only with USE_MIXED_PRECISION)
 * @param velm                 source velocities
 * @param atomOrder            index map applied to the source element index
 * @param innerPosq            destination positions
 * @param innerPosqCorrection  destination position corrections (written only with USE_MIXED_PRECISION)
 * @param innerVelm            destination velocities
 * @param innerInvAtomOrder    index map giving the destination element index
 * @param numAtoms             number of elements to copy
 */
extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4* velm, int* __restrict__ atomOrder,
        real4* innerPosq, real4* innerPosqCorrection, mixed4* innerVelm, int* __restrict__ innerInvAtomOrder,
        int numAtoms) {
    // Each thread starts at its global index and advances by the total number of launched threads.
    const unsigned int stride = blockDim.x*gridDim.x;
    int src = blockIdx.x*blockDim.x+threadIdx.x;
    while (src < numAtoms) {
        const int dst = innerInvAtomOrder[atomOrder[src]];
        innerPosq[dst] = posq[src];
        innerVelm[dst] = velm[src];
#ifdef USE_MIXED_PRECISION
        innerPosqCorrection[dst] = posqCorrection[src];
#endif
        src += stride;
    }
}
/**
 * Transfer the forces computed by the inner context back into the main context.
 *
 * Both force buffers are addressed as three consecutive sections of paddedNumAtoms entries
 * each. Element i of every section of innerForces is written to element
 * invAtomOrder[innerAtomOrder[i]] of the matching section of forces, overwriting the
 * previous value.
 *
 * @param forces          destination force buffer of the main context
 * @param invAtomOrder    index map giving the destination element index
 * @param innerForces     source force buffer of the inner context
 * @param innerAtomOrder  index map applied to the source element index
 * @param numAtoms        number of elements to copy per section
 * @param paddedNumAtoms  length of one section of the force buffers
 */
extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAtomOrder, long long* innerForces,
        int* __restrict__ innerAtomOrder, int numAtoms, int paddedNumAtoms) {
    // Each thread starts at its global index and advances by the total number of launched threads.
    const unsigned int stride = blockDim.x*gridDim.x;
    int src = blockIdx.x*blockDim.x+threadIdx.x;
    while (src < numAtoms) {
        const int dst = invAtomOrder[innerAtomOrder[src]];
        for (int section = 0; section < 3; section++)
            forces[dst+paddedNumAtoms*section] = innerForces[src+paddedNumAtoms*section];
        src += stride;
    }
}
/**
 * Add all the forces from the CVs.
 *
 * The kernel walks over the first bufferSize elements of the force buffer and applies
 * ADD_FORCES to each of them. Both PARAMETER_ARGUMENTS (extra kernel parameters) and
 * ADD_FORCES (the loop body) are macros that are not defined in this file section;
 * presumably they are substituted by the host code when the kernel source is built --
 * TODO confirm against the code that compiles this kernel.
 *
 * @param forces      the force buffer that is updated
 * @param bufferSize  the number of elements of the buffer to process
 */
extern "C" __global__ void addForces(long long* forces, int bufferSize
PARAMETER_ARGUMENTS) {
// Each thread starts at its global index and advances by the total number of launched threads.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x) {
// NOTE(review): the substituted code presumably refers to the loop variable i, so it should not be renamed -- confirm.
ADD_FORCES
}
}
...@@ -6,26 +6,17 @@ inline __device__ real3 trim(real4 v) { ...@@ -6,26 +6,17 @@ inline __device__ real3 trim(real4 v) {
} }
/** /**
* This does nothing, and just exists to simply the code generation. * This does nothing, and just exists to simplify the code generation.
*/ */
inline __device__ real3 trim(real3 v) { inline __device__ real3 trim(real3 v) {
return v; return v;
} }
/** /**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude. * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
*/
inline __device__ real4 delta(real4 vec1, real4 vec2) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
/**
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude. * and setting the fourth component to the squared magnitude.
*/ */
inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { inline __device__ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f); real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result) APPLY_PERIODIC_TO_DELTA(result)
...@@ -95,6 +86,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -95,6 +86,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) { for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
// Load the next block of acceptors into local memory. // Load the next block of acceptors into local memory.
__syncthreads();
int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart); int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
if (threadIdx.x < blockSize) { if (threadIdx.x < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x]; int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
...@@ -105,8 +97,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -105,8 +97,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
__syncthreads(); __syncthreads();
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
for (int index = 0; index < blockSize; index++) { for (int index = 0; index < blockSize; index++) {
#ifdef USE_EXCLUSIONS
int acceptorIndex = acceptorStart+index; int acceptorIndex = acceptorStart+index;
#ifdef USE_EXCLUSIONS
if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w) if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
continue; continue;
#endif #endif
...@@ -115,7 +107,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -115,7 +107,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
real4 a1 = posBuffer[3*index]; real4 a1 = posBuffer[3*index];
real4 a2 = posBuffer[3*index+1]; real4 a2 = posBuffer[3*index+1];
real4 a3 = posBuffer[3*index+2]; real4 a3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
...@@ -183,6 +175,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -183,6 +175,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) { for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
// Load the next block of donors into local memory. // Load the next block of donors into local memory.
__syncthreads();
int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart); int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
if (threadIdx.x < blockSize) { if (threadIdx.x < blockSize) {
int4 atoms2 = donorAtoms[donorStart+threadIdx.x]; int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
...@@ -193,8 +186,8 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -193,8 +186,8 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
__syncthreads(); __syncthreads();
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
for (int index = 0; index < blockSize; index++) { for (int index = 0; index < blockSize; index++) {
#ifdef USE_EXCLUSIONS
int donorIndex = donorStart+index; int donorIndex = donorStart+index;
#ifdef USE_EXCLUSIONS
if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w) if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
continue; continue;
#endif #endif
...@@ -203,7 +196,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -203,7 +196,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
real4 d1 = posBuffer[3*index]; real4 d1 = posBuffer[3*index];
real4 d2 = posBuffer[3*index+1]; real4 d2 = posBuffer[3*index+1];
real4 d3 = posBuffer[3*index+2]; real4 d3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
......
...@@ -60,6 +60,11 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -60,6 +60,11 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
* Determine whether a particular interaction is in the list of exclusions. * Determine whether a particular interaction is in the list of exclusions.
*/ */
inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) { inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
if (atom1 > atom2) {
int temp = atom1;
atom1 = atom2;
atom2 = temp;
}
int first = exclusionStartIndex[atom1]; int first = exclusionStartIndex[atom1];
int last = exclusionStartIndex[atom1+1]; int last = exclusionStartIndex[atom1+1];
for (int i = last-1; i >= first; i--) { for (int i = last-1; i >= first; i--) {
......
...@@ -680,7 +680,9 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt ...@@ -680,7 +680,9 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights, extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights, const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights, const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) { const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
const int* __restrict__ localCoordsStartIndex) {
// Two particle average sites. // Two particle average sites.
...@@ -732,30 +734,31 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -732,30 +734,31 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Local coordinates sites. // Local coordinates sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
int4 atoms = localCoordsAtoms[index]; int siteAtomIndex = localCoordsIndex[index];
const real* params = &localCoordsParams[12*index]; int start = localCoordsStartIndex[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); int end = localCoordsStartIndex[index+1];
mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y); mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z); for (int j = start; j < end; j++) {
mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w); mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z); origin += pos*localCoordsWeights[3*j];
mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z); xdir += pos*localCoordsWeights[3*j+1];
mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z); ydir += pos*localCoordsWeights[3*j+2];
mixed3 originWeights = make_mixed3(params[0], params[1], params[2]); }
mixed3 xWeights = make_mixed3(params[3], params[4], params[5]);
mixed3 yWeights = make_mixed3(params[6], params[7], params[8]);
mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
mixed3 xdir = pos1*xWeights.x + pos2*xWeights.y + pos3*xWeights.z;
mixed3 ydir = pos1*yWeights.x + pos2*yWeights.y + pos3*yWeights.z;
mixed3 zdir = cross(xdir, ydir); mixed3 zdir = cross(xdir, ydir);
xdir *= rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z); mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
zdir *= rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z); mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
xdir *= invNormXdir;
zdir *= invNormZdir;
ydir = cross(zdir, xdir); ydir = cross(zdir, xdir);
real4 localPosition_4 = localCoordsPos[index];
mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z; pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z; pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z; pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
storePos(posq, posqCorrection, atoms.x, pos); storePos(posq, posqCorrection, siteAtomIndex, pos);
} }
} }
...@@ -778,7 +781,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -778,7 +781,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights, const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights, const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) { const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
const int* __restrict__ localCoordsStartIndex) {
// Two particle average sites. // Two particle average sites.
...@@ -826,87 +831,56 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -826,87 +831,56 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Local coordinates sites. // Local coordinates sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
int4 atoms = localCoordsAtoms[index]; int siteAtomIndex = localCoordsIndex[index];
const real* params = &localCoordsParams[12*index]; int start = localCoordsStartIndex[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); int end = localCoordsStartIndex[index+1];
mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y); mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z); for (int j = start; j < end; j++) {
mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w); mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z); origin += pos*localCoordsWeights[3*j];
mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z); xdir += pos*localCoordsWeights[3*j+1];
mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z); ydir += pos*localCoordsWeights[3*j+2];
mixed3 originWeights = make_mixed3(params[0], params[1], params[2]); }
mixed3 wx = make_mixed3(params[3], params[4], params[5]);
mixed3 wy = make_mixed3(params[6], params[7], params[8]);
mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
mixed3 xdir = pos1*wx.x + pos2*wx.y + pos3*wx.z;
mixed3 ydir = pos1*wy.x + pos2*wy.y + pos3*wy.z;
mixed3 zdir = cross(xdir, ydir); mixed3 zdir = cross(xdir, ydir);
mixed invNormXdir = rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z); mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
mixed invNormZdir = rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z); mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
mixed3 dx = xdir*invNormXdir; mixed3 dx = xdir*invNormXdir;
mixed3 dz = zdir*invNormZdir; mixed3 dz = zdir*invNormZdir;
mixed3 dy = cross(dz, dx); mixed3 dy = cross(dz, dx);
real4 localPosition_4 = localCoordsPos[index];
mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
// The derivatives for this case are very complicated. They were computed with SymPy then simplified by hand. // The derivatives for this case are very complicated. They were computed with SymPy then simplified by hand.
mixed t11 = (wx.x*ydir.x-wy.x*xdir.x)*invNormZdir; real3 f = loadForce(siteAtomIndex, force);
mixed t12 = (wx.x*ydir.y-wy.x*xdir.y)*invNormZdir;
mixed t13 = (wx.x*ydir.z-wy.x*xdir.z)*invNormZdir;
mixed t21 = (wx.y*ydir.x-wy.y*xdir.x)*invNormZdir;
mixed t22 = (wx.y*ydir.y-wy.y*xdir.y)*invNormZdir;
mixed t23 = (wx.y*ydir.z-wy.y*xdir.z)*invNormZdir;
mixed t31 = (wx.z*ydir.x-wy.z*xdir.x)*invNormZdir;
mixed t32 = (wx.z*ydir.y-wy.z*xdir.y)*invNormZdir;
mixed t33 = (wx.z*ydir.z-wy.z*xdir.z)*invNormZdir;
mixed sx1 = t13*dz.y-t12*dz.z;
mixed sy1 = t11*dz.z-t13*dz.x;
mixed sz1 = t12*dz.x-t11*dz.y;
mixed sx2 = t23*dz.y-t22*dz.z;
mixed sy2 = t21*dz.z-t23*dz.x;
mixed sz2 = t22*dz.x-t21*dz.y;
mixed sx3 = t33*dz.y-t32*dz.z;
mixed sy3 = t31*dz.z-t33*dz.x;
mixed sz3 = t32*dz.x-t31*dz.y;
mixed3 wxScaled = wx*invNormXdir;
real3 f = loadForce(atoms.x, force);
mixed3 fp1 = localPosition*f.x; mixed3 fp1 = localPosition*f.x;
mixed3 fp2 = localPosition*f.y; mixed3 fp2 = localPosition*f.y;
mixed3 fp3 = localPosition*f.z; mixed3 fp3 = localPosition*f.z;
real3 f1 = make_real3(0); for (int j = start; j < end; j++) {
real3 f2 = make_real3(0); real originWeight = localCoordsWeights[3*j];
real3 f3 = make_real3(0); real wx = localCoordsWeights[3*j+1];
f1.x += fp1.x*wxScaled.x*(1-dx.x*dx.x) + fp1.z*(dz.x*sx1 ) + fp1.y*((-dx.x*dy.x )*wxScaled.x + dy.x*sx1 - dx.y*t12 - dx.z*t13) + f.x*originWeights.x; real wy = localCoordsWeights[3*j+2];
f1.y += fp1.x*wxScaled.x*( -dx.x*dx.y) + fp1.z*(dz.x*sy1+t13) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.x + dy.x*sy1 + dx.y*t11); mixed wxScaled = wx*invNormXdir;
f1.z += fp1.x*wxScaled.x*( -dx.x*dx.z) + fp1.z*(dz.x*sz1-t12) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.x + dy.x*sz1 + dx.z*t11); mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
f2.x += fp1.x*wxScaled.y*(1-dx.x*dx.x) + fp1.z*(dz.x*sx2 ) + fp1.y*((-dx.x*dy.x )*wxScaled.y + dy.x*sx2 - dx.y*t22 - dx.z*t23) + f.x*originWeights.y; mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
f2.y += fp1.x*wxScaled.y*( -dx.x*dx.y) + fp1.z*(dz.x*sy2+t23) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.y + dy.x*sy2 + dx.y*t21); mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
f2.z += fp1.x*wxScaled.y*( -dx.x*dx.z) + fp1.z*(dz.x*sz2-t22) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.y + dy.x*sz2 + dx.z*t21); mixed sx = t3*dz.y-t2*dz.z;
f3.x += fp1.x*wxScaled.z*(1-dx.x*dx.x) + fp1.z*(dz.x*sx3 ) + fp1.y*((-dx.x*dy.x )*wxScaled.z + dy.x*sx3 - dx.y*t32 - dx.z*t33) + f.x*originWeights.z; mixed sy = t1*dz.z-t3*dz.x;
f3.y += fp1.x*wxScaled.z*( -dx.x*dx.y) + fp1.z*(dz.x*sy3+t33) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.z + dy.x*sy3 + dx.y*t31); mixed sz = t2*dz.x-t1*dz.y;
f3.z += fp1.x*wxScaled.z*( -dx.x*dx.z) + fp1.z*(dz.x*sz3-t32) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.z + dy.x*sz3 + dx.z*t31); real3 fresult = make_real3(0);
f1.x += fp2.x*wxScaled.x*( -dx.y*dx.x) + fp2.z*(dz.y*sx1-t13) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.x - dy.y*sx1 - dx.x*t12); fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx ) + fp1.y*((-dx.x*dy.x )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
f1.y += fp2.x*wxScaled.x*(1-dx.y*dx.y) + fp2.z*(dz.y*sy1 ) - fp2.y*(( dx.y*dy.y )*wxScaled.x - dy.y*sy1 + dx.x*t11 + dx.z*t13) + f.y*originWeights.x; fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
f1.z += fp2.x*wxScaled.x*( -dx.y*dx.z) + fp2.z*(dz.y*sz1+t11) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.x - dy.y*sz1 - dx.z*t12); fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
f2.x += fp2.x*wxScaled.y*( -dx.y*dx.x) + fp2.z*(dz.y*sx2-t23) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.y - dy.y*sx2 - dx.x*t22); fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
f2.y += fp2.x*wxScaled.y*(1-dx.y*dx.y) + fp2.z*(dz.y*sy2 ) - fp2.y*(( dx.y*dy.y )*wxScaled.y - dy.y*sy2 + dx.x*t21 + dx.z*t23) + f.y*originWeights.y; fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy ) - fp2.y*(( dx.y*dy.y )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
f2.z += fp2.x*wxScaled.y*( -dx.y*dx.z) + fp2.z*(dz.y*sz2+t21) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.y - dy.y*sz2 - dx.z*t22); fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
f3.x += fp2.x*wxScaled.z*( -dx.y*dx.x) + fp2.z*(dz.y*sx3-t33) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.z - dy.y*sx3 - dx.x*t32); fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
f3.y += fp2.x*wxScaled.z*(1-dx.y*dx.y) + fp2.z*(dz.y*sy3 ) - fp2.y*(( dx.y*dy.y )*wxScaled.z - dy.y*sy3 + dx.x*t31 + dx.z*t33) + f.y*originWeights.z; fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
f3.z += fp2.x*wxScaled.z*( -dx.y*dx.z) + fp2.z*(dz.y*sz3+t31) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.z - dy.y*sz3 - dx.z*t32); fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz ) + fp3.y*((-dx.z*dy.z )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
f1.x += fp3.x*wxScaled.x*( -dx.z*dx.x) + fp3.z*(dz.z*sx1+t12) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.x + dy.z*sx1 + dx.x*t13); addForce(localCoordsAtoms[j], force, fresult);
f1.y += fp3.x*wxScaled.x*( -dx.z*dx.y) + fp3.z*(dz.z*sy1-t11) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.x + dy.z*sy1 + dx.y*t13); }
f1.z += fp3.x*wxScaled.x*(1-dx.z*dx.z) + fp3.z*(dz.z*sz1 ) + fp3.y*((-dx.z*dy.z )*wxScaled.x + dy.z*sz1 - dx.x*t11 - dx.y*t12) + f.z*originWeights.x;
f2.x += fp3.x*wxScaled.y*( -dx.z*dx.x) + fp3.z*(dz.z*sx2+t22) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.y + dy.z*sx2 + dx.x*t23);
f2.y += fp3.x*wxScaled.y*( -dx.z*dx.y) + fp3.z*(dz.z*sy2-t21) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.y + dy.z*sy2 + dx.y*t23);
f2.z += fp3.x*wxScaled.y*(1-dx.z*dx.z) + fp3.z*(dz.z*sz2 ) + fp3.y*((-dx.z*dy.z )*wxScaled.y + dy.z*sz2 - dx.x*t21 - dx.y*t22) + f.z*originWeights.y;
f3.x += fp3.x*wxScaled.z*( -dx.z*dx.x) + fp3.z*(dz.z*sx3+t32) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.z + dy.z*sx3 + dx.x*t33);
f3.y += fp3.x*wxScaled.z*( -dx.z*dx.y) + fp3.z*(dz.z*sy3-t31) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.z + dy.z*sy3 + dx.y*t33);
f3.z += fp3.x*wxScaled.z*(1-dx.z*dx.z) + fp3.z*(dz.z*sz3 ) + fp3.y*((-dx.z*dy.z )*wxScaled.z + dy.z*sz3 - dx.x*t31 - dx.y*t32) + f.z*originWeights.z;
addForce(atoms.y, force, f1);
addForce(atoms.z, force, f2);
addForce(atoms.w, force, f3);
} }
} }
...@@ -924,4 +898,4 @@ extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const ...@@ -924,4 +898,4 @@ extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const
velm[index] = velocity; velm[index] = velocity;
} }
} }
} }
\ No newline at end of file
...@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res ...@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
clearSingleBuffer(buffer6, size6); clearSingleBuffer(buffer6, size6);
} }
/**
 * Add up all the entries of the energy buffer and store the total in *result.
 *
 * Every thread first accumulates a private partial sum over the buffer, advancing by
 * blockDim.x, and stores it in dynamically sized shared memory. The partial sums are then
 * combined pairwise with a doubling distance until element 0 holds the total, which thread 0
 * writes out. Indexing uses threadIdx.x only.
 *
 * @param energyBuffer   the values to sum
 * @param result         receives the sum
 * @param bufferSize     the number of entries in energyBuffer
 * @param workGroupSize  the number of shared memory slots combined in the second phase
 *                       (presumably equal to the launch block size -- confirm against the caller)
 */
__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
    extern __shared__ mixed tempBuffer[];
    const unsigned int tid = threadIdx.x;

    // Phase 1: per-thread partial sum.
    mixed partial = 0;
    unsigned int pos = tid;
    while (pos < bufferSize) {
        partial += energyBuffer[pos];
        pos += blockDim.x;
    }
    tempBuffer[tid] = partial;

    // Phase 2: pairwise combination; the barrier makes the previous round's writes visible.
    for (int distance = 1; distance < workGroupSize; distance *= 2) {
        __syncthreads();
        const bool isReceiver = (tid%(distance*2) == 0);
        if (isReceiver && tid+distance < workGroupSize)
            tempBuffer[tid] += tempBuffer[tid+distance];
    }

    // Thread 0 performed the last write to slot 0 itself, so it can publish the total directly.
    if (tid == 0)
        *result = tempBuffer[0];
}
/** /**
* Record the atomic charges into the posq array. * Record the atomic charges into the posq array.
*/ */
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2017 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestCustomCVForce.h"
/**
 * Hook for platform-specific test cases. The body is intentionally empty, so
 * this file contributes no tests beyond those pulled in through the headers
 * included above.
 * NOTE(review): presumably called by the shared driver in TestCustomCVForce.h -- confirm.
 */
void runPlatformTests() {
}
...@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) { ...@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
system.addParticle(0.0); system.addParticle(0.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false", CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()), platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1); platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
......
...@@ -56,7 +56,7 @@ void testGaussian() { ...@@ -56,7 +56,7 @@ void testGaussian() {
system.addParticle(1.0); system.addParticle(1.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false", CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()), platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1); platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
context.getIntegrationUtilities().initRandomNumberGenerator(0); context.getIntegrationUtilities().initRandomNumberGenerator(0);
......
...@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) { ...@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
system.addParticle(0.0); system.addParticle(0.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false", CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()), platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1); platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
CudaContext& context = *platformData.contexts[0]; CudaContext& context = *platformData.contexts[0];
context.initialize(); context.initialize();
CudaArray data(context, array.size(), 4, "sortData"); CudaArray data(context, array.size(), 4, "sortData");
......
...@@ -163,7 +163,8 @@ public: ...@@ -163,7 +163,8 @@ public:
class ForcePostComputation; class ForcePostComputation;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData); OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData,
OpenCLContext* originalContext);
~OpenCLContext(); ~OpenCLContext();
/** /**
* This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
...@@ -363,9 +364,13 @@ public: ...@@ -363,9 +364,13 @@ public:
*/ */
void reduceBuffer(OpenCLArray& array, int numBuffers); void reduceBuffer(OpenCLArray& array, int numBuffers);
/** /**
* Sum the buffesr containing forces. * Sum the buffers containing forces.
*/ */
void reduceForces(); void reduceForces();
/**
* Sum the buffer containing energy.
*/
double reduceEnergy();
/** /**
* Get the current simulation time. * Get the current simulation time.
*/ */
...@@ -749,6 +754,7 @@ private: ...@@ -749,6 +754,7 @@ private:
cl::Kernel clearSixBuffersKernel; cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceReal4Kernel; cl::Kernel reduceReal4Kernel;
cl::Kernel reduceForcesKernel; cl::Kernel reduceForcesKernel;
cl::Kernel reduceEnergyKernel;
cl::Kernel setChargesKernel; cl::Kernel setChargesKernel;
std::vector<OpenCLForceInfo*> forces; std::vector<OpenCLForceInfo*> forces;
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
...@@ -763,6 +769,7 @@ private: ...@@ -763,6 +769,7 @@ private:
OpenCLArray* forceBuffers; OpenCLArray* forceBuffers;
OpenCLArray* longForceBuffer; OpenCLArray* longForceBuffer;
OpenCLArray* energyBuffer; OpenCLArray* energyBuffer;
OpenCLArray* energySum;
OpenCLArray* energyParamDerivBuffer; OpenCLArray* energyParamDerivBuffer;
OpenCLArray* atomIndexDevice; OpenCLArray* atomIndexDevice;
OpenCLArray* chargeBuffer; OpenCLArray* chargeBuffer;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2014 Stanford University and the Authors. * * Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -156,8 +156,11 @@ private: ...@@ -156,8 +156,11 @@ private:
OpenCLArray* vsite3AvgWeights; OpenCLArray* vsite3AvgWeights;
OpenCLArray* vsiteOutOfPlaneAtoms; OpenCLArray* vsiteOutOfPlaneAtoms;
OpenCLArray* vsiteOutOfPlaneWeights; OpenCLArray* vsiteOutOfPlaneWeights;
OpenCLArray* vsiteLocalCoordsIndex;
OpenCLArray* vsiteLocalCoordsAtoms; OpenCLArray* vsiteLocalCoordsAtoms;
OpenCLArray* vsiteLocalCoordsParams; OpenCLArray* vsiteLocalCoordsWeights;
OpenCLArray* vsiteLocalCoordsPos;
OpenCLArray* vsiteLocalCoordsStartIndex;
int randomPos; int randomPos;
int lastSeed, numVsites; int lastSeed, numVsites;
bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites; bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include "openmm/internal/CompiledExpressionSet.h" #include "openmm/internal/CompiledExpressionSet.h"
#include "openmm/internal/CustomIntegratorUtilities.h" #include "openmm/internal/CustomIntegratorUtilities.h"
#include "lepton/CompiledExpression.h" #include "lepton/CompiledExpression.h"
#include "lepton/ExpressionProgram.h"
#include "openmm/System.h" #include "openmm/System.h"
namespace OpenMM { namespace OpenMM {
...@@ -1207,6 +1208,54 @@ private: ...@@ -1207,6 +1208,54 @@ private:
cl::Kernel framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel; cl::Kernel framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
}; };
/**
 * OpenCL implementation of the kernel that CustomCVForce invokes to compute the
 * forces acting on the system and the system's energy.
 */
class OpenCLCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
public:
    /**
     * Create the kernel. The device arrays start out as NULL and the kernels are
     * flagged as not yet initialized.
     *
     * @param name      the name passed on to CalcCustomCVForceKernel
     * @param platform  the Platform passed on to CalcCustomCVForceKernel
     * @param cl        the OpenCLContext this kernel keeps a reference to
     */
    OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, OpenCLContext& cl)
        : CalcCustomCVForceKernel(name, platform),
          cl(cl),
          hasInitializedKernels(false),
          invAtomOrder(NULL),
          innerInvAtomOrder(NULL) {
    }

    // Defined out of line. NOTE(review): presumably releases the OpenCLArray pointers held below -- confirm.
    ~OpenCLCalcCustomCVForceKernel();

    /**
     * Set up the kernel.
     *
     * @param system         the System to which this kernel will be applied
     * @param force          the CustomCVForce this kernel is being used for
     * @param innerContext   the context the CustomCVForce created for computing collective variables
     */
    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);

    /**
     * Run the kernel to compute the forces and/or the energy.
     *
     * @param context        the context in which this kernel is executed
     * @param innerContext   the context the CustomCVForce created for computing collective variables
     * @param includeForces  true if forces should be computed
     * @param includeEnergy  true if the energy should be computed
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);

    /**
     * Transfer state information into the inner context.
     *
     * @param context        the context in which this kernel is executed
     * @param innerContext   the context the CustomCVForce created for computing collective variables
     */
    void copyState(ContextImpl& context, ContextImpl& innerContext);

private:
    class ReorderListener;

    OpenCLContext& cl;
    bool hasInitializedKernels;

    // Expression programs and the names they refer to.
    Lepton::ExpressionProgram energyExpression;
    std::vector<std::string> variableNames;
    std::vector<std::string> paramDerivNames;
    std::vector<std::string> globalParameterNames;
    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;

    // Device-side storage. NOTE(review): looks like one cvForces entry per collective variable -- confirm.
    std::vector<OpenCLArray*> cvForces;
    OpenCLArray* invAtomOrder;
    OpenCLArray* innerInvAtomOrder;

    cl::Kernel copyStateKernel;
    cl::Kernel copyForcesKernel;
    cl::Kernel addForcesKernel;
};
/** /**
* This kernel is invoked by VerletIntegrator to take one time step. * This kernel is invoked by VerletIntegrator to take one time step.
*/ */
...@@ -1472,7 +1521,9 @@ private: ...@@ -1472,7 +1521,9 @@ private:
class ReorderListener; class ReorderListener;
class GlobalTarget; class GlobalTarget;
class DerivFunction; class DerivFunction;
std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName); std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
std::vector<std::pair<std::string, std::string> >& functionNames);
void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid); void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context); Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes); void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
...@@ -1482,7 +1533,7 @@ private: ...@@ -1482,7 +1533,7 @@ private:
OpenCLContext& cl; OpenCLContext& cl;
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
OpenCLArray* globalValues; OpenCLArray* globalValues;
...@@ -1491,6 +1542,8 @@ private: ...@@ -1491,6 +1542,8 @@ private:
OpenCLArray* uniformRandoms; OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed; OpenCLArray* randomSeed;
OpenCLArray* perDofEnergyParamDerivs; OpenCLArray* perDofEnergyParamDerivs;
std::vector<OpenCLArray*> tabulatedFunctions;
std::map<int, double> savedEnergy;
std::map<int, OpenCLArray*> savedForces; std::map<int, OpenCLArray*> savedForces;
std::set<int> validSavedForces; std::set<int> validSavedForces;
OpenCLParameterSet* perDofValues; OpenCLParameterSet* perDofValues;
...@@ -1573,7 +1626,7 @@ private: ...@@ -1573,7 +1626,7 @@ private:
class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel { class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
public: public:
OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl), OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) { hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
} }
~OpenCLApplyMonteCarloBarostatKernel(); ~OpenCLApplyMonteCarloBarostatKernel();
/** /**
...@@ -1608,6 +1661,7 @@ private: ...@@ -1608,6 +1661,7 @@ private:
bool hasInitializedKernels; bool hasInitializedKernels;
int numMolecules; int numMolecules;
OpenCLArray* savedPositions; OpenCLArray* savedPositions;
OpenCLArray* savedForces;
OpenCLArray* moleculeAtoms; OpenCLArray* moleculeAtoms;
OpenCLArray* moleculeStartIndex; OpenCLArray* moleculeStartIndex;
cl::Kernel kernel; cl::Kernel kernel;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment