Commit c8dac206 authored by Peter Eastman
Browse files

Continuing to implement double precision in OpenCL

parent 34938e2c
......@@ -58,7 +58,7 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co
std::string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) {
arguments.push_back(&data);
argTypes.push_back(type);
return "customArg"+OpenCLExpressionUtilities::intToString(arguments.size());
return "customArg"+context.intToString(arguments.size());
}
void OpenCLBondedUtilities::addPrefixCode(const string& source) {
......@@ -164,17 +164,17 @@ void OpenCLBondedUtilities::initialize(const System& system) {
stringstream s;
for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i];
s<<"__kernel void computeBondedForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, int groups";
s<<"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
for (int i = 0; i < setSize; i++) {
int force = set[i];
string indexType = "uint"+(indexWidth[force] == 1 ? "" : OpenCLExpressionUtilities::intToString(indexWidth[force]));
string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
}
for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n";
s<<"float energy = 0.0f;\n";
s<<"real energy = 0.0f;\n";
for (int i = 0; i < setSize; i++) {
int force = set[i];
s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
......@@ -182,7 +182,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
s<<"energyBuffer[get_global_id(0)] += energy;\n";
s<<"}\n";
map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
cl::Program program = context.createProgram(s.str(), defines);
kernels.push_back(cl::Kernel(program, "computeBondedForces"));
}
......@@ -206,7 +206,7 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
suffix = suffix4;
else
suffix = suffix16;
string indexType = "uint"+(width == 1 ? "" : OpenCLExpressionUtilities::intToString(width));
string indexType = "uint"+(width == 1 ? "" : context.intToString(width));
stringstream s;
s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
......@@ -214,13 +214,13 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<" "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
for (int i = 0; i < numAtoms; i++) {
s<<" unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
s<<" float4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
s<<" real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
}
s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) {
s<<" {\n";
s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
s<<" float4 force = forceBuffers[offset];\n";
s<<" real4 force = forceBuffers[offset];\n";
s<<" force.xyz += force"<<(i+1)<<".xyz;\n";
s<<" forceBuffers[offset] = force;\n";
s<<" }\n";
......
......@@ -68,7 +68,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL), thread(NULL) {
expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
......@@ -145,7 +145,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
this->deviceIndex = deviceIndex;
if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize);
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = "";
else
......@@ -269,7 +269,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
......@@ -316,9 +316,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
thread = new WorkThread();
// Create the integration utilities object.
// Create utilities objects.
integration = new OpenCLIntegrationUtilities(*this, system);
expression = new OpenCLExpressionUtilities(*this);
}
OpenCLContext::~OpenCLContext() {
......@@ -346,6 +347,8 @@ OpenCLContext::~OpenCLContext() {
delete atomIndexDevice;
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL)
delete bonded;
if (nonbonded != NULL)
......@@ -376,10 +379,10 @@ void OpenCLContext::initialize() {
reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
addAutoclearBuffer(*longForceBuffer);
}
addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
addAutoclearBuffer(*forceBuffers);
addAutoclearBuffer(*energyBuffer);
int bufferBytes = max(posq->getSize()*posq->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
......@@ -479,6 +482,21 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
return program;
}
/**
 * Format a floating point value as text that can be inserted into kernel source.
 *
 * The value is written in scientific notation.  When the context uses double
 * precision the stream precision is 16 and nothing is appended; otherwise the
 * stream precision is 8 and an "f" suffix is appended.
 *
 * @param value    the number to format
 * @return the formatted text
 */
string OpenCLContext::doubleToString(double value) {
    const int digits = (useDoublePrecision ? 16 : 8);
    stringstream text;
    text.precision(digits);
    text << scientific << value;
    if (useDoublePrecision)
        return text.str();
    // Single precision: append the "f" suffix to the formatted value.
    return text.str()+"f";
}
/**
 * Format an integer as text that can be inserted into kernel source, using the
 * stream's default integer formatting.
 *
 * @param value    the number to format
 * @return the formatted text
 */
string OpenCLContext::intToString(int value) {
    ostringstream text;
    text << value;
    return text.str();
}
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
......@@ -494,18 +512,23 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
}
void OpenCLContext::clearBuffer(OpenCLArray& array) {
clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize()/sizeof(cl_float));
clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
}
void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
int words = size/4;
clearBufferKernel.setArg<cl::Memory>(0, memory);
clearBufferKernel.setArg<cl_int>(1, size);
executeKernel(clearBufferKernel, size, 128);
clearBufferKernel.setArg<cl_int>(1, words);
executeKernel(clearBufferKernel, words, 128);
}
/**
 * Register an OpenCLArray for automatic clearing.  This forwards the array's
 * device buffer, together with its element count multiplied by its element
 * size, to the overload that takes a cl::Memory and a size.
 *
 * @param array    the array to register
 */
void OpenCLContext::addAutoclearBuffer(OpenCLArray& array) {
    const int sizeInBytes = array.getSize()*array.getElementSize();
    addAutoclearBuffer(array.getDeviceBuffer(), sizeInBytes);
}
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
autoclearBuffers.push_back(&memory);
autoclearBufferSizes.push_back(size);
autoclearBufferSizes.push_back(size/4);
}
void OpenCLContext::clearAutoclearBuffers() {
......@@ -581,10 +604,10 @@ void OpenCLContext::reduceForces() {
void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
int bufferSize = array.getSize()/numBuffers;
reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
executeKernel(reduceFloat4Kernel, bufferSize, 128);
reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
reduceReal4Kernel.setArg<cl_int>(1, bufferSize);
reduceReal4Kernel.setArg<cl_int>(2, numBuffers);
executeKernel(reduceReal4Kernel, bufferSize, 128);
}
void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
......
......@@ -45,6 +45,7 @@ namespace OpenMM {
class OpenCLArray;
class OpenCLForceInfo;
class OpenCLIntegrationUtilities;
class OpenCLExpressionUtilities;
class OpenCLBondedUtilities;
class OpenCLNonbondedUtilities;
class System;
......@@ -314,14 +315,18 @@ public:
* Set all elements of an array to 0.
*
* @param memory the Memory to clear
* @param size the number of float elements in the buffer
* @param size the size of the buffer in bytes
*/
void clearBuffer(cl::Memory& memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*/
void addAutoclearBuffer(OpenCLArray& array);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the Memory to clear
* @param size the number of float elements in the buffer
* @param size the size of the buffer in bytes
*/
void addAutoclearBuffer(cl::Memory& memory, int size);
/**
......@@ -329,7 +334,7 @@ public:
*/
void clearAutoclearBuffers();
/**
* Given a collection of buffers packed into an array, sum them and store
* Given a collection of floating point buffers packed into an array, sum them and store
* the sum in the first buffer.
*
* @param array the array containing the buffers to reduce
......@@ -437,6 +442,15 @@ public:
bool getUseMixedPrecision() {
return useMixedPrecision;
}
/**
* Convert a number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision.
*/
std::string doubleToString(double value);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
std::string intToString(int value);
/**
* Get the size of the periodic box.
*/
......@@ -476,6 +490,12 @@ public:
OpenCLIntegrationUtilities& getIntegrationUtilities() {
return *integration;
}
/**
* Get the OpenCLExpressionUtilities for this context.
*
* @return a reference to the object that the expression member points to
*/
OpenCLExpressionUtilities& getExpressionUtilities() {
return *expression; // dereferences the member pointer without a NULL check
}
/**
* Get the OpenCLBondedUtilities for this context.
*/
......@@ -580,7 +600,7 @@ private:
cl::Kernel clearFourBuffersKernel;
cl::Kernel clearFiveBuffersKernel;
cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceFloat4Kernel;
cl::Kernel reduceReal4Kernel;
cl::Kernel reduceForcesKernel;
std::vector<OpenCLForceInfo*> forces;
std::vector<Molecule> molecules;
......@@ -601,6 +621,7 @@ private:
std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners;
OpenCLIntegrationUtilities* integration;
OpenCLExpressionUtilities* expression;
OpenCLBondedUtilities* bonded;
OpenCLNonbondedUtilities* nonbonded;
WorkThread* thread;
......
......@@ -33,19 +33,6 @@ using namespace OpenMM;
using namespace Lepton;
using namespace std;
string OpenCLExpressionUtilities::doubleToString(double value) {
stringstream s;
s.precision(8);
s << scientific << value << "f";
return s.str();
}
string OpenCLExpressionUtilities::intToString(int value) {
stringstream s;
s << value;
return s.str();
}
string OpenCLExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
vector<pair<ExpressionTreeNode, string> > variableNodes;
......@@ -75,13 +62,13 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
return;
for (int i = 0; i < (int) node.getChildren().size(); i++)
processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
string name = prefix+intToString(temps.size());
string name = prefix+context.intToString(temps.size());
bool hasRecordedNode = false;
out << tempType << " " << name << " = ";
switch (node.getOperation().getId()) {
case Operation::CONSTANT:
out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
break;
case Operation::VARIABLE:
throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
......@@ -107,7 +94,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
string valueName = name;
string derivName = name;
if (valueNode != NULL && derivNode != NULL) {
string name2 = prefix+intToString(temps.size());
string name2 = prefix+context.intToString(temps.size());
out << tempType << " " << name2 << " = 0.0f;\n";
if (isDeriv) {
valueName = name2;
......@@ -236,10 +223,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ADD_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
out << context.doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
break;
case Operation::MULTIPLY_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
out << context.doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
break;
case Operation::POWER_CONSTANT:
{
......@@ -266,7 +253,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
if (iter->first != exponent) {
exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
string name2 = prefix+intToString(temps.size());
string name2 = prefix+context.intToString(temps.size());
names.push_back(name2);
temps.push_back(make_pair(*iter->second, name2));
out << tempType << " " << name2 << " = 0.0f;\n";
......@@ -295,7 +282,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "}";
}
else
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
break;
}
case Operation::MIN:
......
......@@ -45,6 +45,8 @@ namespace OpenMM {
class OPENMM_EXPORT OpenCLExpressionUtilities {
public:
OpenCLExpressionUtilities(OpenCLContext& context) : context(context) {
}
/**
* Generate the source code for calculating a set of expressions.
*
......@@ -54,10 +56,10 @@ public:
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
* @param tempType the type of value to use for temporary variables (defaults to "real")
*/
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
/**
* Generate the source code for calculating a set of expressions.
*
......@@ -69,7 +71,7 @@ public:
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
*/
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
/**
* Calculate the spline coefficients for a tabulated function that appears in expressions.
......@@ -79,26 +81,19 @@ public:
* @param max the value of the independent variable corresponding to the last element of values
* @return the spline coefficients
*/
static std::vector<mm_float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string doubleToString(double value);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string intToString(int value);
std::vector<mm_float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
class FunctionPlaceholder;
private:
static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
std::map<int, const Lepton::ExpressionTreeNode*>& powers);
OpenCLContext& context;
};
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2011 Stanford University and the Authors. *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -47,15 +47,15 @@ void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
maxSize = 1;
zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
zkernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f);
zkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
xkernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f);
xkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
ykernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f);
ykernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
}
......@@ -99,23 +99,23 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"float2 d0 = c1+c4;\n";
source<<"float2 d1 = c2+c3;\n";
source<<"float2 d2 = "<<OpenCLExpressionUtilities::doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
source<<"float2 d3 = "<<OpenCLExpressionUtilities::doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
source<<"float2 d4 = d0+d1;\n";
source<<"float2 d5 = "<<OpenCLExpressionUtilities::doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
source<<"float2 d6 = c0-0.25f*d4;\n";
source<<"float2 d7 = d6+d5;\n";
source<<"float2 d8 = d6-d5;\n";
string coeff = OpenCLExpressionUtilities::doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"float2 d9 = sign*(float2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"float2 d10 = sign*(float2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"real2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"real2 d0 = c1+c4;\n";
source<<"real2 d1 = c2+c3;\n";
source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
source<<"real2 d3 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
source<<"real2 d4 = d0+d1;\n";
source<<"real2 d5 = "<<context.doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
source<<"real2 d6 = c0-0.25f*d4;\n";
source<<"real2 d7 = d6+d5;\n";
source<<"real2 d8 = d6-d5;\n";
string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
......@@ -134,14 +134,14 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float2 d0 = c0+c2;\n";
source<<"float2 d1 = c0-c2;\n";
source<<"float2 d2 = c1+c3;\n";
source<<"float2 d3 = sign*(float2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"real2 d0 = c0+c2;\n";
source<<"real2 d1 = c0-c2;\n";
source<<"real2 d2 = c1+c3;\n";
source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
......@@ -159,12 +159,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float2 d0 = c1+c2;\n";
source<<"float2 d1 = c0-0.5f*d0;\n";
source<<"float2 d2 = sign*"<<OpenCLExpressionUtilities::doubleToString(sin(M_PI/3.0))<<"*(float2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"real2 d0 = c1+c2;\n";
source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
......@@ -181,15 +181,15 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
source<<"}\n";
m = m*2;
}
else
throw OpenMMException("Illegal size for FFT: "+OpenCLExpressionUtilities::intToString(zsize));
throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
source<<"}\n";
++stage;
......@@ -205,16 +205,17 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements;
replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize);
replacements["YSIZE"] = OpenCLExpressionUtilities::intToString(ysize);
replacements["ZSIZE"] = OpenCLExpressionUtilities::intToString(zsize);
replacements["M_PI"] = OpenCLExpressionUtilities::doubleToString(M_PI);
replacements["XSIZE"] = context.intToString(xsize);
replacements["YSIZE"] = context.intToString(ysize);
replacements["ZSIZE"] = context.intToString(zsize);
replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT");
kernel.setArg(3, zsize*sizeof(mm_float2), NULL);
kernel.setArg(4, zsize*sizeof(mm_float2), NULL);
kernel.setArg(5, zsize*sizeof(mm_float2), NULL);
int bufferSize = zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(3, bufferSize, NULL);
kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL);
return kernel;
}
......@@ -559,8 +559,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Create the CCMA kernels.
map<string, string> defines;
defines["NUM_CONSTRAINTS"] = OpenCLExpressionUtilities::intToString(numCCMA);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(numAtoms);
defines["NUM_CONSTRAINTS"] = context.intToString(numCCMA);
defines["NUM_ATOMS"] = context.intToString(numAtoms);
cl::Program ccmaProgram = context.createProgram(OpenCLKernelSources::ccma, defines);
ccmaDirectionsKernel = cl::Kernel(ccmaProgram, "computeConstraintDirections");
ccmaPosForceKernel = cl::Kernel(ccmaProgram, "computeConstraintForce");
......@@ -630,9 +630,9 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Create the kernels for virtual sites.
map<string, string> defines;
defines["NUM_2_AVERAGE"] = OpenCLExpressionUtilities::intToString(num2Avg);
defines["NUM_3_AVERAGE"] = OpenCLExpressionUtilities::intToString(num3Avg);
defines["NUM_OUT_OF_PLANE"] = OpenCLExpressionUtilities::intToString(numOutOfPlane);
defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
......
This diff is collapsed.
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2011 Stanford University and the Authors. *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -267,7 +267,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) {
map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic)
......@@ -281,6 +281,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
if (context.getUseDoublePrecision())
findInteractingBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
else
findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
......@@ -293,6 +296,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
if (context.getUseDoublePrecision())
findInteractionsWithinBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
else
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
......@@ -315,6 +321,20 @@ int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_u
throw OpenMMException("Internal error: exclusion in unexpected tile");
}
/**
 * Set the kernel argument at the given index to the context's periodic box size.
 * The value is passed as an mm_double4 when the context reports that it is using
 * double precision, and as an mm_float4 otherwise.
 *
 * @param cl      the context to query for the precision mode and the box size
 * @param kernel  the kernel whose argument should be set
 * @param index   the index of the kernel argument to set
 */
static void setPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    const bool useDouble = cl.getUseDoublePrecision();
    if (!useDouble) {
        // Single precision: pass the float version of the box size.
        kernel.setArg<mm_float4>(index, cl.getPeriodicBoxSize());
        return;
    }
    kernel.setArg<mm_double4>(index, cl.getPeriodicBoxSizeDouble());
}
/**
 * Set the kernel argument at the given index to the context's inverse periodic box size.
 * The value is passed as an mm_double4 when the context reports that it is using
 * double precision, and as an mm_float4 otherwise.
 *
 * @param cl      the context to query for the precision mode and the inverse box size
 * @param kernel  the kernel whose argument should be set
 * @param index   the index of the kernel argument to set
 */
static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    const bool useDouble = cl.getUseDoublePrecision();
    if (!useDouble) {
        // Single precision: pass the float version of the inverse box size.
        kernel.setArg<mm_float4>(index, cl.getInvPeriodicBoxSize());
        return;
    }
    kernel.setArg<mm_double4>(index, cl.getInvPeriodicBoxSizeDouble());
}
void OpenCLNonbondedUtilities::prepareInteractions() {
if (!useCutoff)
return;
......@@ -327,15 +347,15 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
// Compute the neighbor list.
findBlockBoundsKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findBlockBoundsKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(context, findBlockBoundsKernel, 1);
setInvPeriodicBoxSizeArg(context, findBlockBoundsKernel, 2);
context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 2);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 1);
setInvPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 2);
context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
}
}
......@@ -343,8 +363,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) {
if (useCutoff) {
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(context, forceKernel, 10);
setInvPeriodicBoxSizeArg(context, forceKernel, 11);
}
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
......@@ -498,11 +518,11 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0)
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file;
......
......@@ -30,6 +30,7 @@
#include "OpenCLContext.h"
#include "openmm/System.h"
#include "OpenCLExpressionUtilities.h"
#include <sstream>
#include <string>
#include <vector>
......@@ -287,8 +288,11 @@ public:
name(name), componentType(componentType), numComponents(numComponents), size(size), memory(&memory) {
if (numComponents == 1)
type = componentType;
else
type = componentType+OpenCLExpressionUtilities::intToString(numComponents);
else {
std::stringstream s;
s << componentType << numComponents;
type = s.str();
}
}
const std::string& getName() const {
return name;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors. *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -54,14 +54,14 @@ using namespace std;
class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, mm_float4* pinnedMemory) : context(context), cl(cl), kernel(kernel),
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) : context(context), cl(cl), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) {
}
void execute() {
// Copy coordinates over to this device and execute the kernel.
if (cl.getContextIndex() > 0)
cl.getQueue().enqueueWriteBuffer(cl.getPosq().getDeviceBuffer(), CL_FALSE, 0, cl.getPaddedNumAtoms()*sizeof(mm_float4), pinnedMemory);
cl.getQueue().enqueueWriteBuffer(cl.getPosq().getDeviceBuffer(), CL_FALSE, 0, cl.getPaddedNumAtoms()*cl.getPosq().getElementSize(), pinnedMemory);
kernel.beginComputation(context, includeForce, includeEnergy, groups);
}
private:
......@@ -70,13 +70,13 @@ private:
OpenCLCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy;
int groups;
mm_float4* pinnedMemory;
void* pinnedMemory;
};
class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, mm_float4* pinnedMemory) :
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, void* pinnedMemory) :
context(context), cl(cl), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory) {
}
......@@ -87,8 +87,9 @@ public:
if (includeForce) {
if (cl.getContextIndex() > 0) {
int numAtoms = cl.getPaddedNumAtoms();
void* dest = (cl.getUseDoublePrecision() ? (void*) &((mm_double4*) pinnedMemory)[(cl.getContextIndex()-1)*numAtoms] : (void*) &((mm_float4*) pinnedMemory)[(cl.getContextIndex()-1)*numAtoms]);
cl.getQueue().enqueueReadBuffer(cl.getForce().getDeviceBuffer(), CL_TRUE, 0,
numAtoms*sizeof(mm_float4), &pinnedMemory[(cl.getContextIndex()-1)*numAtoms]);
numAtoms*cl.getForce().getElementSize(), dest);
}
else
cl.getQueue().finish();
......@@ -103,7 +104,7 @@ private:
int groups;
double& energy;
long long& completionTime;
mm_float4* pinnedMemory;
void* pinnedMemory;
};
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
......@@ -129,19 +130,20 @@ void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
OpenCLContext& cl0 = *data.contexts[0];
int elementSize = (cl0.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
if (contextForces == NULL) {
contextForces = OpenCLArray::create<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces");
int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*sizeof(mm_float4);
int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*elementSize;
pinnedPositionBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedPositionMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
pinnedPositionMemory = cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
pinnedForceBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedForceMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedForceBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
pinnedForceMemory = cl0.getQueue().enqueueMapBuffer(*pinnedForceBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
}
// Copy coordinates over to each device and execute the kernel.
cl0.getQueue().enqueueReadBuffer(cl0.getPosq().getDeviceBuffer(), CL_TRUE, 0, cl0.getPaddedNumAtoms()*sizeof(mm_float4), pinnedPositionMemory);
cl0.getQueue().enqueueReadBuffer(cl0.getPosq().getDeviceBuffer(), CL_TRUE, 0, cl0.getPaddedNumAtoms()*elementSize, pinnedPositionMemory);
for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0;
OpenCLContext& cl = *data.contexts[i];
......@@ -165,8 +167,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
OpenCLContext& cl = *data.contexts[0];
int numAtoms = cl.getPaddedNumAtoms();
cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*sizeof(mm_float4),
numAtoms*(data.contexts.size()-1)*sizeof(mm_float4), pinnedForceMemory);
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
cl.reduceBuffer(*contextForces, data.contexts.size());
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
......
......@@ -84,8 +84,8 @@ private:
OpenCLArray* contextForces;
cl::Buffer* pinnedPositionBuffer;
cl::Buffer* pinnedForceBuffer;
mm_float4* pinnedPositionMemory;
mm_float4* pinnedForceMemory;
void* pinnedPositionMemory;
void* pinnedForceMemory;
};
/**
......
......@@ -141,7 +141,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
device << contexts[i]->getDeviceIndex();
}
propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = contexts[0]->intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
contextEnergy.resize(contexts.size());
}
......
......@@ -162,7 +162,7 @@ public:
// Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
context.clearBuffer(*bucketOffset);
assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
assignElementsKernel.setArg<cl_int>(1, data.getSize());
assignElementsKernel.setArg<cl_int>(2, numBuckets);
......
float4 v0 = pos2-pos1;
float4 v1 = pos2-pos3;
float4 cp = cross(v0, v1);
float rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
rp = max(SQRT(rp), 1.0e-06f);
float r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
float r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
float dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
float cosine = clamp(dot*RSQRT(r21*r23), -1.0f, 1.0f);
float theta = acos(cosine);
real4 v0 = pos2-pos1;
real4 v1 = pos2-pos3;
real4 cp = cross(v0, v1);
real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
rp = max(SQRT(rp), (real) 1.0e-06f);
real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
real cosine = clamp(dot*RSQRT(r21*r23), (real) -1, (real) 1);
real theta = acos(cosine);
COMPUTE_FORCE
float4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
float4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
float4 force2 = -force1-force3;
real4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
real4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
real4 force2 = -force1-force3;
float4 delta = pos2-pos1;
float r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
real4 delta = pos2-pos1;
real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
COMPUTE_FORCE
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
delta.xyz *= dEdR;
float4 force1 = delta;
float4 force2 = -delta;
\ No newline at end of file
real4 force1 = delta;
real4 force2 = -delta;
\ No newline at end of file
const float PI = 3.14159265358979323846f;
const real PI = 3.14159265358979323846f;
// Compute the first angle.
float4 v0a = (float4) (pos1.xyz-pos2.xyz, 0.0f);
float4 v1a = (float4) (pos3.xyz-pos2.xyz, 0.0f);
float4 v2a = (float4) (pos3.xyz-pos4.xyz, 0.0f);
float4 cp0a = cross(v0a, v1a);
float4 cp1a = cross(v1a, v2a);
float cosangle = dot(normalize(cp0a), normalize(cp1a));
float angleA;
real4 v0a = (real4) (pos1.xyz-pos2.xyz, 0.0f);
real4 v1a = (real4) (pos3.xyz-pos2.xyz, 0.0f);
real4 v2a = (real4) (pos3.xyz-pos4.xyz, 0.0f);
real4 cp0a = cross(v0a, v1a);
real4 cp1a = cross(v1a, v2a);
real cosangle = dot(normalize(cp0a), normalize(cp1a));
real angleA;
if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 cross_prod = cross(cp0a, cp1a);
float scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
real4 cross_prod = cross(cp0a, cp1a);
real scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f)
angleA = PI-angleA;
......@@ -25,18 +25,18 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
// Compute the second angle.
float4 v0b = (float4) (pos5.xyz-pos6.xyz, 0.0f);
float4 v1b = (float4) (pos7.xyz-pos6.xyz, 0.0f);
float4 v2b = (float4) (pos7.xyz-pos8.xyz, 0.0f);
float4 cp0b = cross(v0b, v1b);
float4 cp1b = cross(v1b, v2b);
real4 v0b = (real4) (pos5.xyz-pos6.xyz, 0.0f);
real4 v1b = (real4) (pos7.xyz-pos6.xyz, 0.0f);
real4 v2b = (real4) (pos7.xyz-pos8.xyz, 0.0f);
real4 cp0b = cross(v0b, v1b);
real4 cp1b = cross(v1b, v2b);
cosangle = dot(normalize(cp0b), normalize(cp1b));
float angleB;
real angleB;
if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 cross_prod = cross(cp0b, cp1b);
float scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
real4 cross_prod = cross(cp0b, cp1b);
real scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f)
angleB = PI-angleB;
......@@ -50,7 +50,7 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
int2 pos = MAP_POS[MAPS[index]];
int size = pos.y;
float delta = 2*PI/size;
real delta = 2*PI/size;
int s = (int) (angleA/delta);
int t = (int) (angleB/delta);
float4 c[4];
......@@ -59,14 +59,14 @@ c[0] = COEFF[coeffIndex];
c[1] = COEFF[coeffIndex+1];
c[2] = COEFF[coeffIndex+2];
c[3] = COEFF[coeffIndex+3];
float da = angleA/delta-s;
float db = angleB/delta-t;
real da = angleA/delta-s;
real db = angleB/delta-t;
// Evaluate the spline to determine the energy and gradients.
float torsionEnergy = 0.0f;
float dEdA = 0.0f;
float dEdB = 0.0f;
real torsionEnergy = 0.0f;
real dEdA = 0.0f;
real dEdB = 0.0f;
torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
......@@ -85,17 +85,17 @@ energy += torsionEnergy;
// Apply the force to the first torsion.
float normCross1 = dot(cp0a, cp0a);
float normSqrBC = dot(v1a, v1a);
float normBC = SQRT(normSqrBC);
float normCross2 = dot(cp1a, cp1a);
float dp = 1.0f/normSqrBC;
float4 ff = (float4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
float4 force1 = ff.x*cp0a;
float4 force4 = ff.w*cp1a;
float4 d = ff.y*force1 - ff.z*force4;
float4 force2 = d-force1;
float4 force3 = -d-force4;
real normCross1 = dot(cp0a, cp0a);
real normSqrBC = dot(v1a, v1a);
real normBC = SQRT(normSqrBC);
real normCross2 = dot(cp1a, cp1a);
real dp = 1.0f/normSqrBC;
real4 ff = (real4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
real4 force1 = ff.x*cp0a;
real4 force4 = ff.w*cp1a;
real4 d = ff.y*force1 - ff.z*force4;
real4 force2 = d-force1;
real4 force3 = -d-force4;
// Apply the force to the second torsion.
......@@ -104,9 +104,9 @@ normSqrBC = dot(v1b, v1b);
normBC = SQRT(normSqrBC);
normCross2 = dot(cp1b, cp1b);
dp = 1.0f/normSqrBC;
ff = (float4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
float4 force5 = ff.x*cp0b;
float4 force8 = ff.w*cp1b;
ff = (real4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
real4 force5 = ff.x*cp0b;
real4 force8 = ff.w*cp1b;
d = ff.y*force5 - ff.z*force8;
float4 force6 = d-force5;
float4 force7 = -d-force8;
real4 force6 = d-force5;
real4 force7 = -d-force8;
#if USE_EWALD
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) {
float tempForce = 0.0f;
real tempForce = 0;
if (r2 < CUTOFF_SQUARED || needCorrection) {
const float alphaR = EWALD_ALPHA*r;
const float expAlphaRSqr = EXP(-alphaR*alphaR);
const float prefactor = 138.935456f*posq1.w*posq2.w*invR;
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
real t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
t *= t;
t *= t;
t *= t;
const float erfcAlphaR = RECIP(t*t);
const real erfcAlphaR = RECIP(t*t);
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
......@@ -24,11 +24,11 @@ if (!isExcluded || needCorrection) {
}
else {
#if HAS_LENNARD_JONES
float sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
float sig2 = invR*sig;
real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
real sig2 = invR*sig;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
#else
......@@ -41,32 +41,37 @@ if (!isExcluded || needCorrection) {
}
#else
{
#ifdef USE_DOUBLE_PRECISION
unsigned long includeInteraction;
#else
unsigned int includeInteraction;
#endif
#ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (!isExcluded);
includeInteraction = (!isExcluded);
#endif
float tempForce = 0.0f;
real tempForce = 0;
#if HAS_LENNARD_JONES
float sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
float sig2 = invR*sig;
real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
real sig2 = invR*sig;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f);
tempEnergy += select(0.0f, epssig6*(sig6 - 1.0f), includeInteraction);
tempEnergy += select((real) 0, epssig6*(sig6-1), includeInteraction);
#endif
#if HAS_COULOMB
#ifdef USE_CUTOFF
const float prefactor = 138.935456f*posq1.w*posq2.w;
const real prefactor = 138.935456f*posq1.w*posq2.w;
tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
tempEnergy += select(0.0f, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
tempEnergy += select((real) 0, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
#else
const float prefactor = 138.935456f*posq1.w*posq2.w*invR;
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
tempForce += prefactor;
tempEnergy += select(0.0f, prefactor, includeInteraction);
tempEnergy += select((real) 0, prefactor, includeInteraction);
#endif
#endif
dEdR += select(0.0f, tempForce*invR*invR, includeInteraction);
dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
}
#endif
\ No newline at end of file
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
float4 ccb_delta(float4 vec1, float4 vec2) {
float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
real4 ccb_delta(real4 vec1, real4 vec2) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
......@@ -10,17 +10,17 @@ float4 ccb_delta(float4 vec1, float4 vec2) {
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
float ccb_computeAngle(float4 vec1, float4 vec2) {
float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
float cosine = dotProduct*RSQRT(vec1.w*vec2.w);
float angle;
real ccb_computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle;
if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 crossProduct = cross(vec1, vec2);
float scale = vec1.w*vec2.w;
real4 crossProduct = cross(vec1, vec2);
real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f)
if (cosine < 0)
angle = M_PI-angle;
}
else
......@@ -31,8 +31,8 @@ float ccb_computeAngle(float4 vec1, float4 vec2) {
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
float4 ccb_computeCross(float4 vec1, float4 vec2) {
float4 result = cross(vec1, vec2);
real4 ccb_computeCross(real4 vec1, real4 vec2) {
real4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
COMPUTE_FORCE
float4 force1 = (float4) (-dEdX, -dEdY, -dEdZ, 0.0f);
real4 force1 = (real4) (-dEdX, -dEdY, -dEdZ, 0);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment