"olla/vscode:/vscode.git/clone" did not exist on "63e2123d965a47a19cad778aee08d3e364bff729"
Commit 1f0ec7b5 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement new CUDA platform

parent 99cebd08
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "openmm/OpenMMException.h"
#include "CudaNonbondedUtilities.h"
#include <iostream>
using namespace OpenMM;
using namespace std;
CudaBondedUtilities::CudaBondedUtilities(CudaContext& context) : context(context), numForceBuffers(0), maxBonds(0), hasInitializedKernels(false) {
}
CudaBondedUtilities::~CudaBondedUtilities() {
for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++)
delete atomIndices[i][j];
}
void CudaBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
if (atoms.size() > 0) {
forceAtoms.push_back(atoms);
forceSource.push_back(source);
forceGroup.push_back(group);
}
}
std::string CudaBondedUtilities::addArgument(CUdeviceptr data, const string& type) {
arguments.push_back(data);
argTypes.push_back(type);
return "customArg"+context.intToString(arguments.size());
}
void CudaBondedUtilities::addPrefixCode(const string& source) {
prefixCode.push_back(source);
}
void CudaBondedUtilities::initialize(const System& system) {
int numForces = forceAtoms.size();
if (numForces == 0)
return;
// Build the lists of atom indices.
atomIndices.resize(numForces);
for (int i = 0; i < numForces; i++) {
int numBonds = forceAtoms[i].size();
int numAtoms = forceAtoms[i][0].size();
int startAtom = 0;
while (startAtom < numAtoms) {
int width = max(numAtoms-startAtom, 4);
if (width == 3)
width = 4;
vector<unsigned int> indexVec(width*numBonds);
for (int bond = 0; bond < numBonds; bond++) {
for (int atom = 0; atom < width; atom++)
indexVec[bond*width+atom] = forceAtoms[i][bond][startAtom+atom];
}
CudaArray* indices = CudaArray::create<unsigned int>(indexVec.size(), "bondedIndices");
indices->upload(indexVec);
atomIndices[i].push_back(indices);
startAtom += width;
}
}
// Create the kernel.
stringstream s;
for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i];
s<<"extern \"C\" __global__ void computeBondedForces(long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
for (int force = 0; force < numForces; force++) {
for (int i = 0; i < (int) atomIndices[force].size(); i++) {
int indexWidth = atomIndices[force][i]->getElementSize()/4;
string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
}
}
for (int i = 0; i < (int) arguments.size(); i++)
s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n";
s<<"real energy = 0;\n";
for (int force = 0; force < numForces; force++)
s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
s<<"}\n";
map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
CUmodule module = context.createModule(s.str(), defines);
kernel = context.getKernel(module, "computeBondedForces");
forceAtoms.clear();
forceSource.clear();
}
string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const string& computeForce) {
maxBonds = max(maxBonds, numBonds);
string suffix1[] = {""};
string suffix4[] = {".x", ".y", ".z", ".w"};
string* suffix;
stringstream s;
s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
int startAtom = 0;
for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4;
suffix = (indexWidth == 1 ? suffix1 : suffix4);
string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
s<<" "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
s<<" "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
for (int j = 0; j < indexWidth; j++) {
s<<" unsigned int atom"<<(startAtom+j+1)<<" = atoms"<<i<<suffix[j]<<";\n";
s<<" real4 pos"<<(j+1)<<" = posq[atom"<<(j+1)<<"];\n";
}
startAtom += indexWidth;
}
s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) {
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"], (long) (force.x*0xFFFFFFFF));\n";
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force.x*0xFFFFFFFF));\n";
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], (long) (force.x*0xFFFFFFFF));\n";
}
s<<"}\n";
return s.str();
}
void CudaBondedUtilities::computeInteractions(int groups) {
// if (!hasInitializedKernels) {
// hasInitializedKernels = true;
// for (int i = 0; i < (int) forceSets.size(); i++) {
// int index = 0;
// cl::Kernel& kernel = kernels[i];
// kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
// kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
// kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
// index++;
// for (int j = 0; j < (int) forceSets[i].size(); j++) {
// kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]]->getDeviceBuffer());
// kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]]->getDeviceBuffer());
// }
// for (int j = 0; j < (int) arguments.size(); j++)
// kernel.setArg<cl::Memory>(index++, *arguments[j]);
// }
// }
// for (int i = 0; i < (int) kernels.size(); i++) {
// kernels[i].setArg<cl_int>(3, groups);
// context.executeKernel(kernels[i], maxBonds);
// }
}
#ifndef OPENMM_CUDABONDEDUTILITIES_H_
#define OPENMM_CUDABONDEDUTILITIES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "CudaContext.h"
#include "openmm/System.h"
#include <string>
#include <vector>
namespace OpenMM {
/**
* This class provides a generic mechanism for evaluating bonded interactions. You write only
* the source code needed to compute one interaction, and this class takes care of creating
* and executing a complete kernel that loops over bonds, evaluates each one, and accumulates
* the resulting forces and energies. This offers two advantages. First, it simplifies the
* task of writing a new Force. Second, it allows multiple forces to be evaluated by a single
* kernel, which reduces overhead and improves performance.
*
* A "bonded interaction" means an interaction that affects a small, fixed set of particles.
* The interaction energy may depend on the positions of only those particles, and the list of
* particles forming a "bond" may not change with time. Examples of bonded interactions
* include HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.
*
* To create a bonded interaction, call addInteraction(). You pass to it a block of source
* code for evaluating the interaction. The inputs and outputs for that source code are as
* follows:
*
* <ol>
* <li>The index of the bond being evaluated will have been stored in the unsigned int variable "index".</li>
* <li>The indices of the atoms forming that bond will have been stored in the unsigned int variables "atom1",
* "atom2", ....</li>
* <li>The positions of those atoms will have been stored in the real4 variables "pos1", "pos2", ....</li>
* <li>A real variable called "energy" will exist. Your code should add the potential energy of the
* bond to that variable.</li>
* <li>Your code should define real4 variables called "force1", "force2", ... that contain the force to
* apply to each atom.</li>
* </ol>
*
* As a simple example, the following source code would be used to implement a pairwise interaction of
* the form E=r^2:
*
* <tt><pre>
* real4 delta = pos2-pos1;
* energy += delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
* real4 force1 = 2.0f*delta;
* real4 force2 = -2.0f*delta;
* </pre></tt>
*
* Interactions will often depend on parameters or other data. Call addArgument() to provide the data
* to this class. It will be passed to the interaction kernel as an argument, and you can refer to it
* from your interaction code.
*/
class OPENMM_EXPORT CudaBondedUtilities {
public:
CudaBondedUtilities(CudaContext& context);
~CudaBondedUtilities();
/**
* Add a bonded interaction.
*
* @param atoms this should have one entry for each bond, and that entry should contain the list
* of atoms involved in the bond. Every entry must have the same number of atoms.
* @param source the code to evaluate the interaction
* @param group the force group in which the interaction should be calculated
*/
void addInteraction(const std::vector<std::vector<int> >& atoms, const std::string& source, int group);
/**
* Add an argument that should be passed to the interaction kernel.
*
* @param data the device memory containing the data to pass
* @param type the data type contained in the memory (e.g. "float4")
* @return the name that will be used for the argument. Any code you pass to addInteraction() should
* refer to it by this name.
*/
std::string addArgument(CUdeviceptr data, const std::string& type);
/**
* Add some Cuda code that should be included in the program, before the start of the kernel.
* This can be used, for example, to define functions that will be called by the kernel.
*
* @param source the code to include
*/
void addPrefixCode(const std::string& source);
/**
* Initialize this object in preparation for a simulation.
*/
void initialize(const System& system);
/**
* Compute the bonded interactions.
*
* @param groups a set of bit flags for which force groups to include
*/
void computeInteractions(int groups);
private:
std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
CudaContext& context;
CUfunction kernel;
std::vector<std::vector<std::vector<int> > > forceAtoms;
std::vector<std::vector<int> > indexWidth;
std::vector<std::string> forceSource;
std::vector<int> forceGroup;
std::vector<CUdeviceptr> arguments;
std::vector<std::string> argTypes;
std::vector<std::vector<CudaArray*> > atomIndices;
std::vector<std::string> prefixCode;
int numForceBuffers, maxBonds;
bool hasInitializedKernels;
};
} // namespace OpenMM
#endif /*OPENMM_CUDABONDEDUTILITIES_H_*/
......@@ -32,7 +32,7 @@
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
//#include "CudaNonbondedUtilities.h"
#include "hilbert.h"
......@@ -40,6 +40,7 @@
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "CudaExpressionUtilities.h"
#include <algorithm>
#include <cstdlib>
#include <fstream>
......@@ -66,8 +67,8 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL),*/ integration(NULL), expression(NULL),
/*bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
if (!hasInitializedCuda) {
CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
hasInitializedCuda = true;
......@@ -143,11 +144,17 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4";
}
else {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
}
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
......@@ -160,8 +167,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceFloat4Kernel = getKernel(utilities, "reduceFloat4Buffer");
reduceForcesKernel = getKernel(utilities, "reduceForces");
// Set defines based on the requested precision.
......@@ -170,14 +175,21 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
//
// // Create the integration utilities object.
//
// integration = new CudaIntegrationUtilities(*this, system);
// Create utilities objects.
integration = new CudaIntegrationUtilities(*this, system);
expression = new CudaExpressionUtilities(*this);
}
CudaContext::~CudaContext() {
......@@ -201,8 +213,10 @@ CudaContext::~CudaContext() {
// delete energyBuffer;
// if (atomIndex != NULL)
// delete atomIndex;
// if (integration != NULL)
// delete integration;
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
// if (bonded != NULL)
// delete bonded;
// if (nonbonded != NULL)
......@@ -272,6 +286,18 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
if (!compilationDefines.empty())
src << endl;
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double3 real3;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n";
}
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
......@@ -498,22 +524,7 @@ void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
// clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
// }
//}
//
//void CudaContext::reduceForces() {
// if (supports64BitGlobalAtomics)
// executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
// else
// reduceBuffer(*forceBuffers, numForceBuffers);
//}
//
//void CudaContext::reduceBuffer(CudaArray<mm_float4>& array, int numBuffers) {
// int bufferSize = array.getSize()/numBuffers;
// reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
// reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
// reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//
void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// Recursively tag atoms as belonging to a particular molecule.
......
......@@ -46,6 +46,7 @@ namespace OpenMM {
class CudaArray;
class CudaForceInfo;
class CudaExpressionUtilities;
class CudaIntegrationUtilities;
class CudaBondedUtilities;
class CudaNonbondedUtilities;
......@@ -216,25 +217,13 @@ public:
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
* @param size the number of 4-byte elements in the buffer
*/
void addAutoclearBuffer(CUdeviceptr memory, int size);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
// void clearAutoclearBuffers();
// /**
// * Given a collection of buffers packed into an array, sum them and store
// * the sum in the first buffer.
// *
// * @param array the array containing the buffers to reduce
// * @param numBuffers the number of buffers packed into the array
// */
// void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
// /**
// * Sum the buffesr containing forces.
// */
// void reduceForces();
/**
* Get the current simulation time.
*/
......@@ -341,12 +330,18 @@ public:
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
// * Get the CudaIntegrationUtilities for this context.
// */
// CudaIntegrationUtilities& getIntegrationUtilities() {
// return *integration;
// }
/**
* Get the CudaIntegrationUtilities for this context.
*/
CudaIntegrationUtilities& getIntegrationUtilities() {
return *integration;
}
/**
* Get the CudaExpressionUtilities for this context.
*/
CudaExpressionUtilities& getExpressionUtilities() {
return *expression;
}
// /**
// * Get the CudaBondedUtilities for this context.
// */
......@@ -445,8 +440,6 @@ private:
CUfunction clearFourBuffersKernel;
CUfunction clearFiveBuffersKernel;
CUfunction clearSixBuffersKernel;
CUfunction reduceFloat4Kernel;
CUfunction reduceForcesKernel;
std::vector<CudaForceInfo*> forces;
std::vector<Molecule> molecules;
std::vector<MoleculeGroup> moleculeGroups;
......@@ -461,7 +454,8 @@ private:
std::vector<CUdeviceptr> autoclearBuffers;
std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners;
// CudaIntegrationUtilities* integration;
CudaIntegrationUtilities* integration;
CudaExpressionUtilities* expression;
// CudaBondedUtilities* bonded;
// CudaNonbondedUtilities* nonbonded;
WorkThread* thread;
......
......@@ -33,19 +33,6 @@ using namespace OpenMM;
using namespace Lepton;
using namespace std;
string CudaExpressionUtilities::doubleToString(double value) {
stringstream s;
s.precision(8);
s << scientific << value << "f";
return s.str();
}
string CudaExpressionUtilities::intToString(int value) {
stringstream s;
s << value;
return s.str();
}
string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
vector<pair<ExpressionTreeNode, string> > variableNodes;
......@@ -75,13 +62,13 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
return;
for (int i = 0; i < (int) node.getChildren().size(); i++)
processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
string name = prefix+intToString(temps.size());
string name = prefix+context.intToString(temps.size());
bool hasRecordedNode = false;
out << tempType << " " << name << " = ";
switch (node.getOperation().getId()) {
case Operation::CONSTANT:
out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
break;
case Operation::VARIABLE:
throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
......@@ -107,7 +94,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
string valueName = name;
string derivName = name;
if (valueNode != NULL && derivNode != NULL) {
string name2 = prefix+intToString(temps.size());
string name2 = prefix+context.intToString(temps.size());
out << tempType << " " << name2 << " = 0.0f;\n";
if (isDeriv) {
valueName = name2;
......@@ -120,14 +107,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
}
out << "{\n";
out << "float4 params = " << functionParams << "[" << i << "];\n";
out << "float x = " << getTempName(node.getChildren()[0], temps) << ";\n";
out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
out << "if (x >= params.x && x <= params.y) {\n";
out << "x = (x-params.x)*params.z;\n";
out << "int index = (int) (floor(x));\n";
out << "index = min(index, (int) params.w);\n";
out << "float4 coeff = " << functions[i].second << "[index];\n";
out << "float b = x-index;\n";
out << "float a = 1.0f-b;\n";
out << "real b = x-index;\n";
out << "real a = 1.0f-b;\n";
if (valueNode != NULL)
out << valueName << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);\n";
if (derivNode != NULL)
......@@ -164,7 +151,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
out << "-" << getTempName(node.getChildren()[0], temps);
break;
case Operation::SQRT:
out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
out << "SQRT(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::EXP:
out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
......@@ -173,31 +160,31 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::SIN:
out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
out << "SIN(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::COS:
out << "cos(" << getTempName(node.getChildren()[0], temps) << ")";
out << "COS(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::SEC:
out << "1.0f/cos(" << getTempName(node.getChildren()[0], temps) << ")";
out << "RECIP(COS(" << getTempName(node.getChildren()[0], temps) << "))";
break;
case Operation::CSC:
out << "1.0f/sin(" << getTempName(node.getChildren()[0], temps) << ")";
out << "RECIP(SIN(" << getTempName(node.getChildren()[0], temps) << "))";
break;
case Operation::TAN:
out << "tan(" << getTempName(node.getChildren()[0], temps) << ")";
out << "TAN(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::COT:
out << "1.0f/tan(" << getTempName(node.getChildren()[0], temps) << ")";
out << "RECIP(TAN(" << getTempName(node.getChildren()[0], temps) << "))";
break;
case Operation::ASIN:
out << "asin(" << getTempName(node.getChildren()[0], temps) << ")";
out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ACOS:
out << "acos(" << getTempName(node.getChildren()[0], temps) << ")";
out << "ACSO(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ATAN:
out << "atan(" << getTempName(node.getChildren()[0], temps) << ")";
out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::SINH:
out << "sinh(" << getTempName(node.getChildren()[0], temps) << ")";
......@@ -236,10 +223,10 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ADD_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
out << context.doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
break;
case Operation::MULTIPLY_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
out << context.doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
break;
case Operation::POWER_CONSTANT:
{
......@@ -266,14 +253,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
if (iter->first != exponent) {
exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
string name2 = prefix+intToString(temps.size());
string name2 = prefix+context.intToString(temps.size());
names.push_back(name2);
temps.push_back(make_pair(*iter->second, name2));
out << tempType << " " << name2 << " = 0.0f;\n";
}
}
out << "{\n";
out << "float multiplier = " << (exponent < 0.0 ? "1.0f/" : "") << getTempName(node.getChildren()[0], temps) << ";\n";
out << "real multiplier = " << (exponent < 0.0 ? "RECIP(" : "(") << getTempName(node.getChildren()[0], temps) << ");\n";
bool done = false;
while (!done) {
done = true;
......@@ -295,7 +282,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
out << "}";
}
else
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
break;
}
case Operation::MIN:
......
......@@ -45,6 +45,8 @@ namespace OpenMM {
class OPENMM_EXPORT CudaExpressionUtilities {
public:
CudaExpressionUtilities(CudaContext& context) : context(context) {
}
/**
* Generate the source code for calculating a set of expressions.
*
......@@ -54,10 +56,10 @@ public:
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
* @param tempType the type of value to use for temporary variables (defaults to "real")
*/
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
/**
* Generate the source code for calculating a set of expressions.
*
......@@ -67,10 +69,10 @@ public:
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
* @param tempType the type of value to use for temporary variables (defaults to "real")
*/
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
/**
* Calculate the spline coefficients for a tabulated function that appears in expressions.
*
......@@ -79,26 +81,19 @@ public:
* @param max the value of the independent variable corresponding to the last element of values
* @return the spline coefficients
*/
static std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string doubleToString(double value);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string intToString(int value);
std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
class FunctionPlaceholder;
private:
static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
std::map<int, const Lepton::ExpressionTreeNode*>& powers);
CudaContext& context;
};
/**
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaIntegrationUtilities.h"
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "openmm/HarmonicAngleForce.h"
#include "openmm/VirtualSite.h"
#include "quern.h"
#include "CudaExpressionUtilities.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <map>
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
struct CudaIntegrationUtilities::ShakeCluster {
int centralID;
int peripheralID[3];
int size;
bool valid;
double distance;
double centralInvMass, peripheralInvMass;
ShakeCluster() : valid(true) {
}
ShakeCluster(int centralID, double invMass) : centralID(centralID), centralInvMass(invMass), size(0), valid(true) {
}
void addAtom(int id, double dist, double invMass) {
if (size == 3 || (size > 0 && abs(dist-distance)/distance > 1e-8) || (size > 0 && abs(invMass-peripheralInvMass)/peripheralInvMass > 1e-8))
valid = false;
else {
peripheralID[size++] = id;
distance = dist;
peripheralInvMass = invMass;
}
}
void markInvalid(map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
{
valid = false;
invalidForShake[centralID] = true;
for (int i = 0; i < size; i++) {
invalidForShake[peripheralID[i]] = true;
map<int, ShakeCluster>::iterator otherCluster = allClusters.find(peripheralID[i]);
if (otherCluster != allClusters.end() && otherCluster->second.valid)
otherCluster->second.markInvalid(allClusters, invalidForShake);
}
}
};
struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int, int, bool> {
const vector<int>& atom1;
const vector<int>& atom2;
const vector<int>& constraints;
ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2, const vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
}
bool operator()(int x, int y) {
int ix = constraints[x];
int iy = constraints[y];
if (atom1[ix] != atom1[iy])
return atom1[ix] < atom1[iy];
return atom2[ix] < atom2[iy];
}
};
CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
ccmaConvergedMemory(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays.
if (context.getUseDoublePrecision()) {
posDelta = CudaArray::create<double4>(context.getPaddedNumAtoms(), "posDelta");
vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = CudaArray::create<double2>(1, "stepSize");
vector<double2> step(1, make_double2(0.0f, 0.0f));
stepSize->upload(step);
}
else {
posDelta = CudaArray::create<float4>(context.getPaddedNumAtoms(), "posDelta");
vector<float4> deltas(posDelta->getSize(), make_float4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = CudaArray::create<float2>(1, "stepSize");
vector<float2> step(1, make_float2(0.0f, 0.0f));
stepSize->upload(step);
}
// Create kernels for enforcing constraints.
map<string, string> velocityDefines;
velocityDefines["CONSTRAIN_VELOCITIES"] = "1";
// CUmodule settleModule = context.createModule(CudaKernelSources::settle);
// settlePosKernel = context.getKernel(settleModule, "applySettle");
// settleVelKernel = context.getKernel(settleModule, "constrainVelocities");
// CUmodule shakeModule = context.createModule(CudaKernelSources::shakeHydrogens);
// shakePosKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
// shakeModule = context.createModule(CudaKernelSources::shakeHydrogens, velocityDefines);
// shakeVelKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
// Record the set of constraints and how many constraints each atom is involved in.
int numConstraints = system.getNumConstraints();
vector<int> atom1(numConstraints);
vector<int> atom2(numConstraints);
vector<double> distance(numConstraints);
vector<int> constraintCount(context.getNumAtoms(), 0);
for (int i = 0; i < numConstraints; i++) {
system.getConstraintParameters(i, atom1[i], atom2[i], distance[i]);
constraintCount[atom1[i]]++;
constraintCount[atom2[i]]++;
}
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
int numAtoms = system.getNumParticles();
vector<map<int, float> > settleConstraints(numAtoms);
for (int i = 0; i < (int)atom1.size(); i++) {
if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
settleConstraints[atom1[i]][atom2[i]] = (float) distance[i];
settleConstraints[atom2[i]][atom1[i]] = (float) distance[i];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
vector<int> settleClusters;
for (int i = 0; i < (int)settleConstraints.size(); i++) {
if (settleConstraints[i].size() == 2) {
int partner1 = settleConstraints[i].begin()->first;
int partner2 = (++settleConstraints[i].begin())->first;
if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
settleConstraints[i].clear();
else if (i < partner1 && i < partner2)
settleClusters.push_back(i);
}
else
settleConstraints[i].clear();
}
// Record the SETTLE clusters.
vector<bool> isShakeAtom(numAtoms, false);
if (settleClusters.size() > 0) {
vector<int4> atoms;
vector<float2> params;
for (int i = 0; i < (int) settleClusters.size(); i++) {
int atom1 = settleClusters[i];
int atom2 = settleConstraints[atom1].begin()->first;
int atom3 = (++settleConstraints[atom1].begin())->first;
float dist12 = settleConstraints[atom1].find(atom2)->second;
float dist13 = settleConstraints[atom1].find(atom3)->second;
float dist23 = settleConstraints[atom2].find(atom3)->second;
if (dist12 == dist13) {
// atom1 is the central atom
atoms.push_back(make_int4(atom1, atom2, atom3, 0));
params.push_back(make_float2(dist12, dist23));
}
else if (dist12 == dist23) {
// atom2 is the central atom
atoms.push_back(make_int4(atom2, atom1, atom3, 0));
params.push_back(make_float2(dist12, dist13));
}
else if (dist13 == dist23) {
// atom3 is the central atom
atoms.push_back(make_int4(atom3, atom1, atom2, 0));
params.push_back(make_float2(dist13, dist12));
}
else
throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
isShakeAtom[atom1] = true;
isShakeAtom[atom2] = true;
isShakeAtom[atom3] = true;
}
settleAtoms = CudaArray::create<int4>(atoms.size(), "settleAtoms");
settleParams = CudaArray::create<float2>(params.size(), "settleParams");
settleAtoms->upload(atoms);
settleParams->upload(params);
}
// Find clusters consisting of a central atom with up to three peripheral atoms.
map<int, ShakeCluster> clusters;
vector<bool> invalidForShake(numAtoms, false);
for (int i = 0; i < (int) atom1.size(); i++) {
if (isShakeAtom[atom1[i]])
continue; // This is being taken care of with SETTLE.
// Determine which is the central atom.
bool firstIsCentral;
if (constraintCount[atom1[i]] > 1)
firstIsCentral = true;
else if (constraintCount[atom2[i]] > 1)
firstIsCentral = false;
else if (atom1[i] < atom2[i])
firstIsCentral = true;
else
firstIsCentral = false;
int centralID, peripheralID;
if (firstIsCentral) {
centralID = atom1[i];
peripheralID = atom2[i];
}
else {
centralID = atom2[i];
peripheralID = atom1[i];
}
// Add it to the cluster.
if (clusters.find(centralID) == clusters.end()) {
clusters[centralID] = ShakeCluster(centralID, 1.0/system.getParticleMass(centralID));
}
ShakeCluster& cluster = clusters[centralID];
cluster.addAtom(peripheralID, distance[i], 1.0/system.getParticleMass(peripheralID));
if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
cluster.markInvalid(clusters, invalidForShake);
map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
if (otherCluster != clusters.end() && otherCluster->second.valid)
otherCluster->second.markInvalid(clusters, invalidForShake);
}
}
int validShakeClusters = 0;
for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
ShakeCluster& cluster = iter->second;
if (cluster.valid) {
cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
for (int i = 0; i < cluster.size; i++)
if (invalidForShake[cluster.peripheralID[i]])
cluster.valid = false;
if (cluster.valid)
++validShakeClusters;
}
}
// Record the SHAKE clusters.
if (validShakeClusters > 0) {
vector<int4> atoms;
vector<float4> params;
int index = 0;
for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
const ShakeCluster& cluster = iter->second;
if (!cluster.valid)
continue;
atoms.push_back(make_int4(cluster.centralID, cluster.peripheralID[0], (cluster.size > 1 ? cluster.peripheralID[1] : -1), (cluster.size > 2 ? cluster.peripheralID[2] : -1)));
params.push_back(make_float4((float) cluster.centralInvMass, (float) (0.5/(cluster.centralInvMass+cluster.peripheralInvMass)), (float) (cluster.distance*cluster.distance), (float) cluster.peripheralInvMass));
isShakeAtom[cluster.centralID] = true;
isShakeAtom[cluster.peripheralID[0]] = true;
if (cluster.size > 1)
isShakeAtom[cluster.peripheralID[1]] = true;
if (cluster.size > 2)
isShakeAtom[cluster.peripheralID[2]] = true;
++index;
}
shakeAtoms = CudaArray::create<int4>(atoms.size(), "shakeAtoms");
shakeParams = CudaArray::create<float4>(params.size(), "shakeParams");
shakeAtoms->upload(atoms);
shakeParams->upload(params);
}
// Find connected constraints for CCMA.
vector<int> ccmaConstraints;
for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]])
ccmaConstraints.push_back(i);
// Record the connections between constraints.
int numCCMA = (int) ccmaConstraints.size();
if (numCCMA > 0) {
vector<vector<int> > atomConstraints(context.getNumAtoms());
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
vector<vector<int> > linkedConstraints(numCCMA);
for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
for (unsigned j = 0; j < i; j++) {
int c1 = atomConstraints[atom][i];
int c2 = atomConstraints[atom][j];
linkedConstraints[c1].push_back(c2);
linkedConstraints[c2].push_back(c1);
}
}
int maxLinks = 0;
for (unsigned i = 0; i < linkedConstraints.size(); i++)
maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Compute the constraint coupling matrix
vector<vector<int> > atomAngles(numAtoms);
HarmonicAngleForce const* angleForce = NULL;
for (int i = 0; i < system.getNumForces() && angleForce == NULL; i++)
angleForce = dynamic_cast<HarmonicAngleForce const*>(&system.getForce(i));
if (angleForce != NULL)
for (int i = 0; i < angleForce->getNumAngles(); i++) {
int particle1, particle2, particle3;
double angle, k;
angleForce->getAngleParameters(i, particle1, particle2, particle3, angle, k);
atomAngles[particle2].push_back(i);
}
vector<vector<pair<int, double> > > matrix(numCCMA);
for (int j = 0; j < numCCMA; j++) {
for (int k = 0; k < numCCMA; k++) {
if (j == k) {
matrix[j].push_back(pair<int, double>(j, 1.0));
continue;
}
double scale;
int cj = ccmaConstraints[j];
int ck = ccmaConstraints[k];
int atomj0 = atom1[cj];
int atomj1 = atom2[cj];
int atomk0 = atom1[ck];
int atomk1 = atom2[ck];
int atoma, atomb, atomc;
double imj0 = 1.0/system.getParticleMass(atomj0);
double imj1 = 1.0/system.getParticleMass(atomj1);
if (atomj0 == atomk0) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk1;
scale = imj0/(imj0+imj1);
}
else if (atomj1 == atomk1) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk0;
scale = imj1/(imj0+imj1);
}
else if (atomj0 == atomk1) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk0;
scale = imj0/(imj0+imj1);
}
else if (atomj1 == atomk0) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk1;
scale = imj1/(imj0+imj1);
}
else
continue; // These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
bool foundConstraint = false;
for (int m = 0; m < numCCMA; m++) {
int other = ccmaConstraints[m];
if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
double d1 = distance[cj];
double d2 = distance[ck];
double d3 = distance[other];
matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
foundConstraint = true;
break;
}
}
if (!foundConstraint && angleForce != NULL) {
// We didn't find one, so look for an angle force field term.
const vector<int>& angleCandidates = atomAngles[atomb];
for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
int particle1, particle2, particle3;
double angle, ka;
angleForce->getAngleParameters(*iter, particle1, particle2, particle3, angle, ka);
if ((particle1 == atoma && particle3 == atomc) || (particle3 == atoma && particle1 == atomc)) {
matrix[j].push_back(pair<int, double>(k, scale*cos(angle)));
break;
}
}
}
}
}
// Invert it using QR.
vector<int> matrixRowStart;
vector<int> matrixColIndex;
vector<double> matrixValue;
for (int i = 0; i < numCCMA; i++) {
matrixRowStart.push_back(matrixValue.size());
for (int j = 0; j < (int) matrix[i].size(); j++) {
pair<int, double> element = matrix[i][j];
matrixColIndex.push_back(element.first);
matrixValue.push_back(element.second);
}
}
matrixRowStart.push_back(matrixValue.size());
int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
double *qValue, *rValue;
int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
&qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
vector<double> rhs(numCCMA);
matrix.clear();
matrix.resize(numCCMA);
for (int i = 0; i < numCCMA; i++) {
// Extract column i of the inverse matrix.
for (int j = 0; j < numCCMA; j++)
rhs[j] = (i == j ? 1.0 : 0.0);
result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numCCMA; j++) {
double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
if (abs(value) > 0.1)
matrix[j].push_back(pair<int, double>(i, value));
}
}
QUERN_free_result(qRowStart, qColIndex, qValue);
QUERN_free_result(rRowStart, rColIndex, rValue);
int maxRowElements = 0;
for (unsigned i = 0; i < matrix.size(); i++)
maxRowElements = max(maxRowElements, (int) matrix[i].size());
maxRowElements++;
// Sort the constraints.
vector<int> constraintOrder(numCCMA);
for (int i = 0; i < numCCMA; ++i)
constraintOrder[i] = i;
sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
vector<int> inverseOrder(numCCMA);
for (int i = 0; i < numCCMA; ++i)
inverseOrder[constraintOrder[i]] = i;
for (int i = 0; i < (int)matrix.size(); ++i)
for (int j = 0; j < (int)matrix[i].size(); ++j)
matrix[i][j].first = inverseOrder[matrix[i][j].first];
// Record the CCMA data structures.
ccmaAtoms = CudaArray::create<int2>(numCCMA, "CcmaAtoms");
ccmaDistance = CudaArray::create<float4>(numCCMA, "CcmaDistance");
ccmaAtomConstraints = CudaArray::create<int>(numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = CudaArray::create<int>(numAtoms, "CcmaAtomConstraintsIndex");
ccmaDelta1 = CudaArray::create<float>(numCCMA, "CcmaDelta1");
ccmaDelta2 = CudaArray::create<float>(numCCMA, "CcmaDelta2");
ccmaConverged = CudaArray::create<int>(2, "CcmaConverged");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), 0), "Error allocating pinned memory");
ccmaReducedMass = CudaArray::create<float>(numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixColumn = CudaArray::create<int>(numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConstraintMatrixValue = CudaArray::create<float>(numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<int2> atomsVec(ccmaAtoms->getSize());
vector<float4> distanceVec(ccmaDistance->getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<float> reducedMassVec(ccmaReducedMass->getSize());
vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = (float) distance[c];
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaAtoms->upload(atomsVec);
ccmaDistance->upload(distanceVec);
ccmaAtomConstraints->upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
// Create the CCMA kernels.
map<string, string> defines;
defines["NUM_CONSTRAINTS"] = context.intToString(numCCMA);
defines["NUM_ATOMS"] = context.intToString(numAtoms);
// CUmodule ccmaModule = context.createModule(CudaKernelSources::ccma, defines);
// ccmaDirectionsKernel = context.getKernel(ccmaModule, "computeConstraintDirections");
// ccmaPosForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
// ccmaMultiplyKernel = context.getKernel(ccmaModule, "multiplyByConstraintMatrix");
// ccmaPosUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
// defines["CONSTRAIN_VELOCITIES"] = "1";
// ccmaModule = context.createModule(CudaKernelSources::ccma, defines);
// ccmaVelForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
// ccmaVelUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
}
// Build the list of virtual sites.
vector<int4> vsite2AvgAtomVec;
vector<float2> vsite2AvgWeightVec;
vector<int4> vsite3AvgAtomVec;
vector<float4> vsite3AvgWeightVec;
vector<int4> vsiteOutOfPlaneAtomVec;
vector<float4> vsiteOutOfPlaneWeightVec;
for (int i = 0; i < numAtoms; i++) {
if (system.isVirtualSite(i)) {
if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// A two particle average.
const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
vsite2AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), 0));
vsite2AvgWeightVec.push_back(make_float2((float) site.getWeight(0), (float) site.getWeight(1)));
}
else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// A three particle average.
const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
vsite3AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsite3AvgWeightVec.push_back(make_float4((float) site.getWeight(0), (float) site.getWeight(1), (float) site.getWeight(2), 0.0f));
}
else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// An out of plane site.
const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
vsiteOutOfPlaneAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsiteOutOfPlaneWeightVec.push_back(make_float4((float) site.getWeight12(), (float) site.getWeight13(), (float) site.getWeightCross(), 0.0f));
}
}
}
int num2Avg = vsite2AvgAtomVec.size();
int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
vsite2AvgAtoms = CudaArray::create<int4>(max(1, num2Avg), "vsite2AvgAtoms");
vsite2AvgWeights = CudaArray::create<float2>(max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgAtoms = CudaArray::create<int4>(max(1, num3Avg), "vsite3AvgAtoms");
vsite3AvgWeights = CudaArray::create<float4>(max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneAtoms = CudaArray::create<int4>(max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteOutOfPlaneWeights = CudaArray::create<float4>(max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
if (num2Avg > 0) {
vsite2AvgAtoms->upload(vsite2AvgAtomVec);
vsite2AvgWeights->upload(vsite2AvgWeightVec);
}
if (num3Avg > 0) {
vsite3AvgAtoms->upload(vsite3AvgAtomVec);
vsite3AvgWeights->upload(vsite3AvgWeightVec);
}
if (numOutOfPlane > 0) {
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
}
// Create the kernels for virtual sites.
map<string, string> defines;
defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
// CUmodule vsiteModule = context.createModule(CudaKernelSources::virtualSites, defines);
// vsitePositionKernel = context.getKernel(vsiteModule, "computeVirtualSites");
// vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
// vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer());
// vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer());
// vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer());
// vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer());
// vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer());
// vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer());
// vsiteForceKernel = context.getKernel(vsiteModule, "distributeForces");
// vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
// // Skip argument 1: the force array hasn't been created yet.
// vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
// vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
// vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
// vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
// vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
// vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
numVsites = num2Avg+num3Avg+numOutOfPlane;
}
CudaIntegrationUtilities::~CudaIntegrationUtilities() {
if (posDelta != NULL)
delete posDelta;
if (settleAtoms != NULL)
delete settleAtoms;
if (settleParams != NULL)
delete settleParams;
if (shakeAtoms != NULL)
delete shakeAtoms;
if (shakeParams != NULL)
delete shakeParams;
if (random != NULL)
delete random;
if (randomSeed != NULL)
delete randomSeed;
if (stepSize != NULL)
delete stepSize;
if (ccmaAtoms != NULL)
delete ccmaAtoms;
if (ccmaDistance != NULL)
delete ccmaDistance;
if (ccmaReducedMass != NULL)
delete ccmaReducedMass;
if (ccmaAtomConstraints != NULL)
delete ccmaAtomConstraints;
if (ccmaNumAtomConstraints != NULL)
delete ccmaNumAtomConstraints;
if (ccmaConstraintMatrixColumn != NULL)
delete ccmaConstraintMatrixColumn;
if (ccmaConstraintMatrixValue != NULL)
delete ccmaConstraintMatrixValue;
if (ccmaDelta1 != NULL)
delete ccmaDelta1;
if (ccmaDelta2 != NULL)
delete ccmaDelta2;
if (ccmaConverged != NULL)
delete ccmaConverged;
if (ccmaConvergedMemory != NULL)
cuMemFreeHost(ccmaConvergedMemory);
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
delete vsite2AvgWeights;
if (vsite3AvgAtoms != NULL)
delete vsite3AvgAtoms;
if (vsite3AvgWeights != NULL)
delete vsite3AvgWeights;
if (vsiteOutOfPlaneAtoms != NULL)
delete vsiteOutOfPlaneAtoms;
if (vsiteOutOfPlaneWeights != NULL)
delete vsiteOutOfPlaneWeights;
}
//void CudaIntegrationUtilities::applyConstraints(double tol) {
// applyConstraints(false, tol);
//}
//
//void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
// applyConstraints(true, tol);
//}
//
//void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
// bool hasInitialized;
// CUfunction settleKernel, shakeKernel, ccmaForceKernel, ccmaUpdateKernel;
// if (constrainVelocities) {
// hasInitialized = hasInitializedVelConstraintKernels;
// settleKernel = settleVelKernel;
// shakeKernel = shakeVelKernel;
// ccmaForceKernel = ccmaVelForceKernel;
// ccmaUpdateKernel = ccmaVelUpdateKernel;
// hasInitializedVelConstraintKernels = true;
// }
// else {
// hasInitialized = hasInitializedPosConstraintKernels;
// settleKernel = settlePosKernel;
// shakeKernel = shakePosKernel;
// ccmaForceKernel = ccmaPosForceKernel;
// ccmaUpdateKernel = ccmaPosUpdateKernel;
// hasInitializedPosConstraintKernels = true;
// }
// if (settleAtoms != NULL) {
// if (!hasInitialized) {
// settleKernel.setArg<int>(0, settleAtoms->getSize());
// settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
// settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer());
// settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
// settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer());
// settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer());
// }
// settleKernel.setArg<float>(1, (float) tol);
// context.executeKernel(settleKernel, settleAtoms->getSize());
// }
// if (shakeAtoms != NULL) {
// if (!hasInitialized) {
// shakeKernel.setArg<int>(0, shakeAtoms->getSize());
// shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
// shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
// shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer());
// shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer());
// }
// shakeKernel.setArg<float>(1, (float) tol);
// context.executeKernel(shakeKernel, shakeAtoms->getSize());
// }
// if (ccmaAtoms != NULL) {
// if (!hasInitialized) {
// ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
// ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
// ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
// ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
// ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
// ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
// ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
// ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
// ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
// ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
// ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
// ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
// ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue->getDeviceBuffer());
// ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(5, ccmaDelta1->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
// ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
// }
// ccmaForceKernel.setArg<float>(6, (float) tol);
// context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
// const int checkInterval = 4;
// cl::Event event;
// for (int i = 0; i < 150; i++) {
// ccmaForceKernel.setArg<int>(7, i);
// if (i == 0) {
// ccmaConvergedMemory[0] = 1;
// ccmaConvergedMemory[1] = 0;
// context.getQueue().enqueueWriteBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), ccmaConvergedMemory);
// }
// context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
// if ((i+1)%checkInterval == 0)
// context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), ccmaConvergedMemory, NULL, &event);
// ccmaMultiplyKernel.setArg<int>(5, i);
// context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
// ccmaUpdateKernel.setArg<int>(8, i);
// context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
// if ((i+1)%checkInterval == 0) {
// event.wait();
// if (ccmaConvergedMemory[i%2])
// break;
// }
// }
// }
//}
//
//void CudaIntegrationUtilities::computeVirtualSites() {
// if (numVsites > 0)
// context.executeKernel(vsitePositionKernel, numVsites);
//}
//
//void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
// if (numVsites > 0) {
// vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
// context.executeKernel(vsiteForceKernel, numVsites);
// }
//}
void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
if (random != NULL) {
if (randomNumberSeed != lastSeed)
throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
return;
}
// Create the random number arrays.
lastSeed = randomNumberSeed;
random = CudaArray::create<float4>(32*context.getPaddedNumAtoms(), "random");
randomSeed = CudaArray::create<int4>(context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
randomPos = random->getSize();
// Use a quick and dirty RNG to pick seeds for the real random number generator.
vector<int4> seed(randomSeed->getSize());
unsigned int r = randomNumberSeed;
for (int i = 0; i < randomSeed->getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
}
randomSeed->upload(seed);
// Create the kernel.
CUmodule randomModule = context.createModule(CudaKernelSources::random);
randomKernel = context.getKernel(randomModule, "generateRandomNumbers");
}
int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
if (randomPos+numValues <= random->getSize()) {
int oldPos = randomPos;
randomPos += numValues;
return oldPos;
}
if (numValues > random->getSize()) {
delete random;
random = CudaArray::create<float4>(numValues, "random");
}
int size = random->getSize();
void* args[] = {&size, &random->getDevicePointer(), &randomSeed->getDevicePointer()};
context.executeKernel(randomKernel, args, random->getSize());
randomPos = numValues;
return 0;
}
void CudaIntegrationUtilities::createCheckpoint(ostream& stream) {
stream.write((char*) &randomPos, sizeof(int));
vector<float4> randomVec;
random->download(randomVec);
stream.write((char*) &randomVec[0], sizeof(float4)*random->getSize());
vector<int4> randomSeedVec;
randomSeed->download(randomSeedVec);
stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize());
}
void CudaIntegrationUtilities::loadCheckpoint(istream& stream) {
stream.read((char*) &randomPos, sizeof(int));
vector<float4> randomVec(random->getSize());
stream.read((char*) &randomVec[0], sizeof(float4)*random->getSize());
random->upload(randomVec);
vector<int4> randomSeedVec(randomSeed->getSize());
stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize());
randomSeed->upload(randomSeedVec);
}
#ifndef OPENMM_CUDAINTEGRATIONUTILITIES_H_
#define OPENMM_CUDAINTEGRATIONUTILITIES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/System.h"
#include "CudaContext.h"
#include "openmm/internal/windowsExport.h"
#include <iosfwd>
namespace OpenMM {
/**
* This class implements features that are used by many different integrators, including
* common workspace arrays, random number generation, and enforcing constraints.
*/
class OPENMM_EXPORT CudaIntegrationUtilities {
public:
CudaIntegrationUtilities(CudaContext& context, const System& system);
~CudaIntegrationUtilities();
/**
* Get the array which contains position deltas.
*/
CudaArray& getPosDelta() {
return *posDelta;
}
/**
* Get the array which contains random values. Each element is a float4, whose components
* are independent, normally distributed random numbers with mean 0 and variance 1.
*/
CudaArray& getRandom() {
return *random;
}
/**
* Get the array which contains the current step size.
*/
CudaArray& getStepSize() {
return *stepSize;
}
/**
* Apply constraints to the atom positions.
*
* @param tol the constraint tolerance
*/
void applyConstraints(double tol);
/**
* Apply constraints to the atom velocities.
*
* @param tol the constraint tolerance
*/
void applyVelocityConstraints(double tol);
/**
* Initialize the random number generator.
*/
void initRandomNumberGenerator(unsigned int randomNumberSeed);
/**
* Ensure that sufficient random numbers are available in the array, and generate new ones if not.
*
* @param numValues the number of random float4's that will be required
* @return the index in the array at which to start reading
*/
int prepareRandomNumbers(int numValues);
/**
* Compute the positions of virtual sites.
*/
void computeVirtualSites();
/**
* Distribute forces from virtual sites to the atoms they are based on.
*/
void distributeForcesFromVirtualSites();
/**
* Create a checkpoint recording the current state of the random number generator.
*
* @param stream an output stream the checkpoint data should be written to
*/
void createCheckpoint(std::ostream& stream);
/**
* Load a checkpoint that was written by createCheckpoint().
*
* @param stream an input stream the checkpoint data should be read from
*/
void loadCheckpoint(std::istream& stream);
private:
void applyConstraints(bool constrainVelocities, double tol);
CudaContext& context;
CUfunction settlePosKernel, settleVelKernel;
CUfunction shakePosKernel, shakeVelKernel;
CUfunction ccmaDirectionsKernel;
CUfunction ccmaPosForceKernel, ccmaVelForceKernel;
CUfunction ccmaMultiplyKernel;
CUfunction ccmaPosUpdateKernel, ccmaVelUpdateKernel;
CUfunction vsitePositionKernel, vsiteForceKernel;
CUfunction randomKernel;
CudaArray* posDelta;
CudaArray* settleAtoms;
CudaArray* settleParams;
CudaArray* shakeAtoms;
CudaArray* shakeParams;
CudaArray* random;
CudaArray* randomSeed;
CudaArray* stepSize;
CudaArray* ccmaAtoms;
CudaArray* ccmaDistance;
CudaArray* ccmaReducedMass;
CudaArray* ccmaAtomConstraints;
CudaArray* ccmaNumAtomConstraints;
CudaArray* ccmaConstraintMatrixColumn;
CudaArray* ccmaConstraintMatrixValue;
CudaArray* ccmaDelta1;
CudaArray* ccmaDelta2;
CudaArray* ccmaConverged;
int* ccmaConvergedMemory;
CudaArray* vsite2AvgAtoms;
CudaArray* vsite2AvgWeights;
CudaArray* vsite3AvgAtoms;
CudaArray* vsite3AvgWeights;
CudaArray* vsiteOutOfPlaneAtoms;
CudaArray* vsiteOutOfPlaneWeights;
int randomPos;
int lastSeed, numVsites;
bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;
struct ShakeCluster;
struct ConstraintOrderer;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAINTEGRATIONUTILITIES_H_*/
/**
* Generate random numbers
*/
extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
uint4 state = seed[index];
unsigned int carry = 0;
while (index < numValues) {
float4 value;
// Generate first value.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x1 = sqrt(-2.0f * log(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.x = x1 * cos(2.0f * 3.14159265f * x2);
// Generate second value.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x3 = sqrt(-2.0f * log(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.y = x3 * cos(2.0f * 3.14159265f * x4);
// Generate third value.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x5 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x5 = sqrt(-2.0f * log(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.z = x5 * cos(2.0f * 3.14159265f * x6);
// Generate fourth value.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x7 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x7 = sqrt(-2.0f * log(x7));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.w = x7 * cos(2.0f * 3.14159265f * x8);
// Record the values.
random[index] = value;
index += blockDim.x*gridDim.x;
}
seed[blockIdx.x*blockDim.x+threadIdx.x] = state;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of random number generation.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaIntegrationUtilities.h"
#include "openmm/System.h"
#include <iostream>
using namespace OpenMM;
using namespace std;
void testGaussian() {
int numAtoms = 5000;
System system;
for (int i = 0; i < numAtoms; i++)
system.addParticle(1.0);
CudaPlatform platform;
CudaPlatform::PlatformData platformData(system, "", "true", "single",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
CudaContext& context = *platformData.contexts[0];
context.initialize();
context.getIntegrationUtilities().initRandomNumberGenerator(0);
CudaArray& random = context.getIntegrationUtilities().getRandom();
context.getIntegrationUtilities().prepareRandomNumbers(random.getSize());
const int numValues = random.getSize()*4;
vector<float4> values(numValues);
random.download(values);
float* data = reinterpret_cast<float*>(&values[0]);
double mean = 0.0;
double var = 0.0;
double skew = 0.0;
double kurtosis = 0.0;
for (int i = 0; i < numValues; i++) {
double value = data[i];
mean += value;
var += value*value;
skew += value*value*value;
kurtosis += value*value*value*value;
}
mean /= numValues;
var /= numValues;
skew /= numValues;
kurtosis /= numValues;
double c2 = var-mean*mean;
double c3 = skew-3*var*mean+2*mean*mean*mean;
double c4 = kurtosis-4*skew*mean-3*var*var+12*var*mean*mean-6*mean*mean*mean*mean;
ASSERT_EQUAL_TOL(0.0, mean, 3.0/sqrt((double)numValues));
ASSERT_EQUAL_TOL(1.0, c2, 3.0/pow(numValues, 1.0/3.0));
ASSERT_EQUAL_TOL(0.0, c3, 3.0/pow(numValues, 1.0/4.0));
ASSERT_EQUAL_TOL(0.0, c4, 3.0/pow(numValues, 1.0/4.0));
}
int main() {
try {
testGaussian();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment