Commit c8dac206 authored by Peter Eastman
Browse files

Continuing to implement double precision in OpenCL

parent 34938e2c
...@@ -58,7 +58,7 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co ...@@ -58,7 +58,7 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co
std::string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) { std::string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) {
arguments.push_back(&data); arguments.push_back(&data);
argTypes.push_back(type); argTypes.push_back(type);
return "customArg"+OpenCLExpressionUtilities::intToString(arguments.size()); return "customArg"+context.intToString(arguments.size());
} }
void OpenCLBondedUtilities::addPrefixCode(const string& source) { void OpenCLBondedUtilities::addPrefixCode(const string& source) {
...@@ -164,17 +164,17 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -164,17 +164,17 @@ void OpenCLBondedUtilities::initialize(const System& system) {
stringstream s; stringstream s;
for (int i = 0; i < (int) prefixCode.size(); i++) for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i]; s<<prefixCode[i];
s<<"__kernel void computeBondedForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, int groups"; s<<"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
for (int i = 0; i < setSize; i++) { for (int i = 0; i < setSize; i++) {
int force = set[i]; int force = set[i];
string indexType = "uint"+(indexWidth[force] == 1 ? "" : OpenCLExpressionUtilities::intToString(indexWidth[force])); string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
s<<", __global const "<<indexType<<"* restrict atomIndices"<<i; s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i; s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
} }
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1); s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n"; s<<") {\n";
s<<"float energy = 0.0f;\n"; s<<"real energy = 0.0f;\n";
for (int i = 0; i < setSize; i++) { for (int i = 0; i < setSize; i++) {
int force = set[i]; int force = set[i];
s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]); s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
...@@ -182,7 +182,7 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -182,7 +182,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
s<<"energyBuffer[get_global_id(0)] += energy;\n"; s<<"energyBuffer[get_global_id(0)] += energy;\n";
s<<"}\n"; s<<"}\n";
map<string, string> defines; map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
cl::Program program = context.createProgram(s.str(), defines); cl::Program program = context.createProgram(s.str(), defines);
kernels.push_back(cl::Kernel(program, "computeBondedForces")); kernels.push_back(cl::Kernel(program, "computeBondedForces"));
} }
...@@ -206,7 +206,7 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in ...@@ -206,7 +206,7 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
suffix = suffix4; suffix = suffix4;
else else
suffix = suffix16; suffix = suffix16;
string indexType = "uint"+(width == 1 ? "" : OpenCLExpressionUtilities::intToString(width)); string indexType = "uint"+(width == 1 ? "" : context.intToString(width));
stringstream s; stringstream s;
s<<"if ((groups&"<<(1<<group)<<") != 0)\n"; s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n"; s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
...@@ -214,13 +214,13 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in ...@@ -214,13 +214,13 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<" "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n"; s<<" "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
s<<" unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n"; s<<" unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
s<<" float4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n"; s<<" real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
} }
s<<computeForce<<"\n"; s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
s<<" {\n"; s<<" {\n";
s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n"; s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
s<<" float4 force = forceBuffers[offset];\n"; s<<" real4 force = forceBuffers[offset];\n";
s<<" force.xyz += force"<<(i+1)<<".xyz;\n"; s<<" force.xyz += force"<<(i+1)<<".xyz;\n";
s<<" forceBuffers[offset] = force;\n"; s<<" forceBuffers[offset] = force;\n";
s<<" }\n"; s<<" }\n";
......
...@@ -68,7 +68,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i ...@@ -68,7 +68,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) : OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL), system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL), posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL), thread(NULL) { expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") { if (precision == "single") {
useDoublePrecision = false; useDoublePrecision = false;
useMixedPrecision = false; useMixedPrecision = false;
...@@ -145,7 +145,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -145,7 +145,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
this->deviceIndex = deviceIndex; this->deviceIndex = deviceIndex;
if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize) if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM"); throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel") if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = ""; defaultOptimizationOptions = "";
else else
...@@ -269,7 +269,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -269,7 +269,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers"); clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers"); clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers"); clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer"); reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
reduceForcesKernel = cl::Kernel(utilities, "reduceForces"); reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use. // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
...@@ -316,9 +316,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -316,9 +316,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
thread = new WorkThread(); thread = new WorkThread();
// Create the integration utilities object. // Create utilities objects.
integration = new OpenCLIntegrationUtilities(*this, system); integration = new OpenCLIntegrationUtilities(*this, system);
expression = new OpenCLExpressionUtilities(*this);
} }
OpenCLContext::~OpenCLContext() { OpenCLContext::~OpenCLContext() {
...@@ -346,6 +347,8 @@ OpenCLContext::~OpenCLContext() { ...@@ -346,6 +347,8 @@ OpenCLContext::~OpenCLContext() {
delete atomIndexDevice; delete atomIndexDevice;
if (integration != NULL) if (integration != NULL)
delete integration; delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL) if (bonded != NULL)
delete bonded; delete bonded;
if (nonbonded != NULL) if (nonbonded != NULL)
...@@ -376,10 +379,10 @@ void OpenCLContext::initialize() { ...@@ -376,10 +379,10 @@ void OpenCLContext::initialize() {
reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer()); reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms); reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
reduceForcesKernel.setArg<cl_int>(3, numForceBuffers); reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2); addAutoclearBuffer(*longForceBuffer);
} }
addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4); addAutoclearBuffer(*forceBuffers);
addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize()); addAutoclearBuffer(*energyBuffer);
int bufferBytes = max(posq->getSize()*posq->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize()); int bufferBytes = max(posq->getSize()*posq->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes); pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes); pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
...@@ -479,6 +482,21 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string, ...@@ -479,6 +482,21 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
return program; return program;
} }
/**
 * Format a floating point value as text for insertion into kernel source.
 * The value is written in scientific notation.  In double precision mode it
 * carries 16 digits after the decimal point; otherwise it carries 8 digits
 * and is followed by an "f" suffix.
 */
string OpenCLContext::doubleToString(double value) {
    const int fractionDigits = (useDoublePrecision ? 16 : 8);
    stringstream text;
    text.precision(fractionDigits);
    text << scientific << value;
    // Only the double precision form is emitted without a suffix.
    if (useDoublePrecision)
        return text.str();
    text << "f";
    return text.str();
}
/**
 * Format an integer as text for insertion into kernel source.
 */
string OpenCLContext::intToString(int value) {
    ostringstream text;
    text << value;
    return text.str();
}
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) { void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
if (blockSize == -1) if (blockSize == -1)
blockSize = ThreadBlockSize; blockSize = ThreadBlockSize;
...@@ -494,18 +512,23 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi ...@@ -494,18 +512,23 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
} }
void OpenCLContext::clearBuffer(OpenCLArray& array) { void OpenCLContext::clearBuffer(OpenCLArray& array) {
clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize()/sizeof(cl_float)); clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
} }
void OpenCLContext::clearBuffer(cl::Memory& memory, int size) { void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
int words = size/4;
clearBufferKernel.setArg<cl::Memory>(0, memory); clearBufferKernel.setArg<cl::Memory>(0, memory);
clearBufferKernel.setArg<cl_int>(1, size); clearBufferKernel.setArg<cl_int>(1, words);
executeKernel(clearBufferKernel, size, 128); executeKernel(clearBufferKernel, words, 128);
}
/**
 * Register an OpenCLArray for automatic clearing.  This forwards the array's
 * device buffer, together with its total size (element count times element
 * size), to the cl::Memory overload.
 */
void OpenCLContext::addAutoclearBuffer(OpenCLArray& array) {
    int totalBytes = array.getSize()*array.getElementSize();
    addAutoclearBuffer(array.getDeviceBuffer(), totalBytes);
}
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) { void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
autoclearBuffers.push_back(&memory); autoclearBuffers.push_back(&memory);
autoclearBufferSizes.push_back(size); autoclearBufferSizes.push_back(size/4);
} }
void OpenCLContext::clearAutoclearBuffers() { void OpenCLContext::clearAutoclearBuffers() {
...@@ -581,10 +604,10 @@ void OpenCLContext::reduceForces() { ...@@ -581,10 +604,10 @@ void OpenCLContext::reduceForces() {
void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) { void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
int bufferSize = array.getSize()/numBuffers; int bufferSize = array.getSize()/numBuffers;
reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer()); reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
reduceFloat4Kernel.setArg<cl_int>(1, bufferSize); reduceReal4Kernel.setArg<cl_int>(1, bufferSize);
reduceFloat4Kernel.setArg<cl_int>(2, numBuffers); reduceReal4Kernel.setArg<cl_int>(2, numBuffers);
executeKernel(reduceFloat4Kernel, bufferSize, 128); executeKernel(reduceReal4Kernel, bufferSize, 128);
} }
void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) { void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
......
...@@ -45,6 +45,7 @@ namespace OpenMM { ...@@ -45,6 +45,7 @@ namespace OpenMM {
class OpenCLArray; class OpenCLArray;
class OpenCLForceInfo; class OpenCLForceInfo;
class OpenCLIntegrationUtilities; class OpenCLIntegrationUtilities;
class OpenCLExpressionUtilities;
class OpenCLBondedUtilities; class OpenCLBondedUtilities;
class OpenCLNonbondedUtilities; class OpenCLNonbondedUtilities;
class System; class System;
...@@ -314,14 +315,18 @@ public: ...@@ -314,14 +315,18 @@ public:
* Set all elements of an array to 0. * Set all elements of an array to 0.
* *
* @param memory the Memory to clear * @param memory the Memory to clear
* @param size the number of float elements in the buffer * @param size the size of the buffer in bytes
*/ */
void clearBuffer(cl::Memory& memory, int size); void clearBuffer(cl::Memory& memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param array    the array whose device buffer should be cleared; the size to clear is taken from the array itself
*/
void addAutoclearBuffer(OpenCLArray& array);
/** /**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation. * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
* *
* @param memory the Memory to clear * @param memory the Memory to clear
* @param size the number of float elements in the buffer * @param size the size of the buffer in bytes
*/ */
void addAutoclearBuffer(cl::Memory& memory, int size); void addAutoclearBuffer(cl::Memory& memory, int size);
/** /**
...@@ -329,7 +334,7 @@ public: ...@@ -329,7 +334,7 @@ public:
*/ */
void clearAutoclearBuffers(); void clearAutoclearBuffers();
/** /**
* Given a collection of buffers packed into an array, sum them and store * Given a collection of floating point buffers packed into an array, sum them and store
* the sum in the first buffer. * the sum in the first buffer.
* *
* @param array the array containing the buffers to reduce * @param array the array containing the buffers to reduce
...@@ -437,6 +442,15 @@ public: ...@@ -437,6 +442,15 @@ public:
bool getUseMixedPrecision() { bool getUseMixedPrecision() {
return useMixedPrecision; return useMixedPrecision;
} }
/**
* Convert a floating point number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision: the value is
* written in scientific notation with 16 digits after the decimal point in double precision
* mode, or with 8 digits followed by an "f" suffix otherwise.
*
* @param value    the number to convert
* @return the text representing the value
*/
std::string doubleToString(double value);
/**
* Convert an integer to a string in a format suitable for including in a kernel.
*
* @param value    the integer to convert
* @return the text representing the value
*/
std::string intToString(int value);
/** /**
* Get the size of the periodic box. * Get the size of the periodic box.
*/ */
...@@ -476,6 +490,12 @@ public: ...@@ -476,6 +490,12 @@ public:
OpenCLIntegrationUtilities& getIntegrationUtilities() { OpenCLIntegrationUtilities& getIntegrationUtilities() {
return *integration; return *integration;
} }
/**
* Access the OpenCLExpressionUtilities belonging to this context.
*
* @return a reference to the object the context's expression pointer refers to
*/
OpenCLExpressionUtilities& getExpressionUtilities() {
    OpenCLExpressionUtilities& utilities = *expression;
    return utilities;
}
/** /**
* Get the OpenCLBondedUtilities for this context. * Get the OpenCLBondedUtilities for this context.
*/ */
...@@ -580,7 +600,7 @@ private: ...@@ -580,7 +600,7 @@ private:
cl::Kernel clearFourBuffersKernel; cl::Kernel clearFourBuffersKernel;
cl::Kernel clearFiveBuffersKernel; cl::Kernel clearFiveBuffersKernel;
cl::Kernel clearSixBuffersKernel; cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceFloat4Kernel; cl::Kernel reduceReal4Kernel;
cl::Kernel reduceForcesKernel; cl::Kernel reduceForcesKernel;
std::vector<OpenCLForceInfo*> forces; std::vector<OpenCLForceInfo*> forces;
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
...@@ -601,6 +621,7 @@ private: ...@@ -601,6 +621,7 @@ private:
std::vector<int> autoclearBufferSizes; std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners; std::vector<ReorderListener*> reorderListeners;
OpenCLIntegrationUtilities* integration; OpenCLIntegrationUtilities* integration;
OpenCLExpressionUtilities* expression;
OpenCLBondedUtilities* bonded; OpenCLBondedUtilities* bonded;
OpenCLNonbondedUtilities* nonbonded; OpenCLNonbondedUtilities* nonbonded;
WorkThread* thread; WorkThread* thread;
......
...@@ -33,19 +33,6 @@ using namespace OpenMM; ...@@ -33,19 +33,6 @@ using namespace OpenMM;
using namespace Lepton; using namespace Lepton;
using namespace std; using namespace std;
string OpenCLExpressionUtilities::doubleToString(double value) {
stringstream s;
s.precision(8);
s << scientific << value << "f";
return s.str();
}
string OpenCLExpressionUtilities::intToString(int value) {
stringstream s;
s << value;
return s.str();
}
string OpenCLExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables, string OpenCLExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) { const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
vector<pair<ExpressionTreeNode, string> > variableNodes; vector<pair<ExpressionTreeNode, string> > variableNodes;
...@@ -75,13 +62,13 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -75,13 +62,13 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
return; return;
for (int i = 0; i < (int) node.getChildren().size(); i++) for (int i = 0; i < (int) node.getChildren().size(); i++)
processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType); processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
string name = prefix+intToString(temps.size()); string name = prefix+context.intToString(temps.size());
bool hasRecordedNode = false; bool hasRecordedNode = false;
out << tempType << " " << name << " = "; out << tempType << " " << name << " = ";
switch (node.getOperation().getId()) { switch (node.getOperation().getId()) {
case Operation::CONSTANT: case Operation::CONSTANT:
out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue()); out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
break; break;
case Operation::VARIABLE: case Operation::VARIABLE:
throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName()); throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
...@@ -107,7 +94,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -107,7 +94,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
string valueName = name; string valueName = name;
string derivName = name; string derivName = name;
if (valueNode != NULL && derivNode != NULL) { if (valueNode != NULL && derivNode != NULL) {
string name2 = prefix+intToString(temps.size()); string name2 = prefix+context.intToString(temps.size());
out << tempType << " " << name2 << " = 0.0f;\n"; out << tempType << " " << name2 << " = 0.0f;\n";
if (isDeriv) { if (isDeriv) {
valueName = name2; valueName = name2;
...@@ -236,10 +223,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -236,10 +223,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")"; out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::ADD_CONSTANT: case Operation::ADD_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps); out << context.doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
break; break;
case Operation::MULTIPLY_CONSTANT: case Operation::MULTIPLY_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps); out << context.doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
break; break;
case Operation::POWER_CONSTANT: case Operation::POWER_CONSTANT:
{ {
...@@ -266,7 +253,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -266,7 +253,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) { for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
if (iter->first != exponent) { if (iter->first != exponent) {
exponents.push_back(iter->first >= 0 ? iter->first : -iter->first); exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
string name2 = prefix+intToString(temps.size()); string name2 = prefix+context.intToString(temps.size());
names.push_back(name2); names.push_back(name2);
temps.push_back(make_pair(*iter->second, name2)); temps.push_back(make_pair(*iter->second, name2));
out << tempType << " " << name2 << " = 0.0f;\n"; out << tempType << " " << name2 << " = 0.0f;\n";
...@@ -295,7 +282,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -295,7 +282,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "}"; out << "}";
} }
else else
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")"; out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
break; break;
} }
case Operation::MIN: case Operation::MIN:
......
...@@ -45,6 +45,8 @@ namespace OpenMM { ...@@ -45,6 +45,8 @@ namespace OpenMM {
class OPENMM_EXPORT OpenCLExpressionUtilities { class OPENMM_EXPORT OpenCLExpressionUtilities {
public: public:
/**
 * Create an OpenCLExpressionUtilities bound to a context.
 *
 * @param context    the context this object is bound to; it is stored by reference, so it must outlive this object
 */
OpenCLExpressionUtilities(OpenCLContext& context) : context(context) {
}
/** /**
* Generate the source code for calculating a set of expressions. * Generate the source code for calculating a set of expressions.
* *
...@@ -54,10 +56,10 @@ public: ...@@ -54,10 +56,10 @@ public:
* @param functions defines the variable name for each tabulated function that may appear in the expressions * @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables * @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function * @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float") * @param tempType the type of value to use for temporary variables (defaults to "real")
*/ */
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables, std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float"); const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
/** /**
* Generate the source code for calculating a set of expressions. * Generate the source code for calculating a set of expressions.
* *
...@@ -69,7 +71,7 @@ public: ...@@ -69,7 +71,7 @@ public:
* @param functionParams the variable name containing the parameters for each tabulated function * @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float") * @param tempType the type of value to use for temporary variables (defaults to "float")
*/ */
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables, std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float"); const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
/** /**
* Calculate the spline coefficients for a tabulated function that appears in expressions. * Calculate the spline coefficients for a tabulated function that appears in expressions.
...@@ -79,26 +81,19 @@ public: ...@@ -79,26 +81,19 @@ public:
* @param max the value of the independent variable corresponding to the last element of values * @param max the value of the independent variable corresponding to the last element of values
* @return the spline coefficients * @return the spline coefficients
*/ */
static std::vector<mm_float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max); std::vector<mm_float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string doubleToString(double value);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string intToString(int value);
class FunctionPlaceholder; class FunctionPlaceholder;
private: private:
static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node, void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType); const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps); std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode, void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode); const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode, void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
std::map<int, const Lepton::ExpressionTreeNode*>& powers); std::map<int, const Lepton::ExpressionTreeNode*>& powers);
OpenCLContext& context;
}; };
/** /**
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2011 Stanford University and the Authors. * * Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -47,15 +47,15 @@ void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) { ...@@ -47,15 +47,15 @@ void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
maxSize = 1; maxSize = 1;
zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer()); zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer()); zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
zkernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f); zkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize)); context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer()); xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer()); xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
xkernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f); xkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize)); context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer()); ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer()); ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
ykernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f); ykernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize)); context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
} }
...@@ -99,23 +99,23 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) { ...@@ -99,23 +99,23 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n"; source<<"int i = get_local_id(0);\n";
} }
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n"; source<<"real2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"float2 d0 = c1+c4;\n"; source<<"real2 d0 = c1+c4;\n";
source<<"float2 d1 = c2+c3;\n"; source<<"real2 d1 = c2+c3;\n";
source<<"float2 d2 = "<<OpenCLExpressionUtilities::doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n"; source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
source<<"float2 d3 = "<<OpenCLExpressionUtilities::doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n"; source<<"real2 d3 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
source<<"float2 d4 = d0+d1;\n"; source<<"real2 d4 = d0+d1;\n";
source<<"float2 d5 = "<<OpenCLExpressionUtilities::doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n"; source<<"real2 d5 = "<<context.doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
source<<"float2 d6 = c0-0.25f*d4;\n"; source<<"real2 d6 = c0-0.25f*d4;\n";
source<<"float2 d7 = d6+d5;\n"; source<<"real2 d7 = d6+d5;\n";
source<<"float2 d8 = d6-d5;\n"; source<<"real2 d8 = d6-d5;\n";
string coeff = OpenCLExpressionUtilities::doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI)); string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"float2 d9 = sign*(float2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n"; source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"float2 d10 = sign*(float2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n"; source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n"; source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n"; source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n"; source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
...@@ -134,14 +134,14 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) { ...@@ -134,14 +134,14 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n"; source<<"int i = get_local_id(0);\n";
} }
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float2 d0 = c0+c2;\n"; source<<"real2 d0 = c0+c2;\n";
source<<"float2 d1 = c0-c2;\n"; source<<"real2 d1 = c0-c2;\n";
source<<"float2 d2 = c1+c3;\n"; source<<"real2 d2 = c1+c3;\n";
source<<"float2 d3 = sign*(float2) (c1.y-c3.y, c3.x-c1.x);\n"; source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n"; source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n"; source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n"; source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
...@@ -159,12 +159,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) { ...@@ -159,12 +159,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n"; source<<"int i = get_local_id(0);\n";
} }
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float2 d0 = c1+c2;\n"; source<<"real2 d0 = c1+c2;\n";
source<<"float2 d1 = c0-0.5f*d0;\n"; source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"float2 d2 = sign*"<<OpenCLExpressionUtilities::doubleToString(sin(M_PI/3.0))<<"*(float2) (c1.y-c2.y, c2.x-c1.x);\n"; source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n"; source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n"; source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n"; source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
...@@ -181,15 +181,15 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) { ...@@ -181,15 +181,15 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"int i = get_local_id(0);\n"; source<<"int i = get_local_id(0);\n";
} }
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n"; source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n"; source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
source<<"}\n"; source<<"}\n";
m = m*2; m = m*2;
} }
else else
throw OpenMMException("Illegal size for FFT: "+OpenCLExpressionUtilities::intToString(zsize)); throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
source<<"barrier(CLK_LOCAL_MEM_FENCE);\n"; source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
source<<"}\n"; source<<"}\n";
++stage; ++stage;
...@@ -205,16 +205,17 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) { ...@@ -205,16 +205,17 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n"; source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
source<<"barrier(CLK_GLOBAL_MEM_FENCE);"; source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize); replacements["XSIZE"] = context.intToString(xsize);
replacements["YSIZE"] = OpenCLExpressionUtilities::intToString(ysize); replacements["YSIZE"] = context.intToString(ysize);
replacements["ZSIZE"] = OpenCLExpressionUtilities::intToString(zsize); replacements["ZSIZE"] = context.intToString(zsize);
replacements["M_PI"] = OpenCLExpressionUtilities::doubleToString(M_PI); replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str(); replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0"); replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT"); cl::Kernel kernel(program, "execFFT");
kernel.setArg(3, zsize*sizeof(mm_float2), NULL); int bufferSize = zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(4, zsize*sizeof(mm_float2), NULL); kernel.setArg(3, bufferSize, NULL);
kernel.setArg(5, zsize*sizeof(mm_float2), NULL); kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL);
return kernel; return kernel;
} }
...@@ -559,8 +559,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -559,8 +559,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Create the CCMA kernels. // Create the CCMA kernels.
map<string, string> defines; map<string, string> defines;
defines["NUM_CONSTRAINTS"] = OpenCLExpressionUtilities::intToString(numCCMA); defines["NUM_CONSTRAINTS"] = context.intToString(numCCMA);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(numAtoms); defines["NUM_ATOMS"] = context.intToString(numAtoms);
cl::Program ccmaProgram = context.createProgram(OpenCLKernelSources::ccma, defines); cl::Program ccmaProgram = context.createProgram(OpenCLKernelSources::ccma, defines);
ccmaDirectionsKernel = cl::Kernel(ccmaProgram, "computeConstraintDirections"); ccmaDirectionsKernel = cl::Kernel(ccmaProgram, "computeConstraintDirections");
ccmaPosForceKernel = cl::Kernel(ccmaProgram, "computeConstraintForce"); ccmaPosForceKernel = cl::Kernel(ccmaProgram, "computeConstraintForce");
...@@ -630,9 +630,9 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -630,9 +630,9 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Create the kernels for virtual sites. // Create the kernels for virtual sites.
map<string, string> defines; map<string, string> defines;
defines["NUM_2_AVERAGE"] = OpenCLExpressionUtilities::intToString(num2Avg); defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
defines["NUM_3_AVERAGE"] = OpenCLExpressionUtilities::intToString(num3Avg); defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
defines["NUM_OUT_OF_PLANE"] = OpenCLExpressionUtilities::intToString(numOutOfPlane); defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines); cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites"); vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
......
This diff is collapsed.
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2011 Stanford University and the Authors. * * Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -267,7 +267,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -267,7 +267,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true); forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) { if (useCutoff) {
map<string, string> defines; map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if (forceBufferPerAtomBlock) if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic) if (usePeriodic)
...@@ -281,6 +281,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -281,6 +281,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
if (context.getUseDoublePrecision())
findInteractingBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
else
findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff)); findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
...@@ -293,6 +296,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -293,6 +296,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles); findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) { if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
if (context.getUseDoublePrecision())
findInteractionsWithinBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
else
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff)); findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
...@@ -315,6 +321,20 @@ int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_u ...@@ -315,6 +321,20 @@ int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_u
throw OpenMMException("Internal error: exclusion in unexpected tile"); throw OpenMMException("Internal error: exclusion in unexpected tile");
} }
/**
 * Set the periodic box size as argument number `index` of a kernel.
 * The value is passed as an mm_double4 when the context reports double
 * precision, and as an mm_float4 otherwise.
 */
static void setPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    const bool singlePrecision = !cl.getUseDoublePrecision();
    if (singlePrecision) {
        kernel.setArg<mm_float4>(index, cl.getPeriodicBoxSize());
        return;
    }
    kernel.setArg<mm_double4>(index, cl.getPeriodicBoxSizeDouble());
}
/**
 * Set the inverse periodic box size as argument number `index` of a kernel.
 * The value is passed as an mm_double4 when the context reports double
 * precision, and as an mm_float4 otherwise.
 */
static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    const bool singlePrecision = !cl.getUseDoublePrecision();
    if (singlePrecision) {
        kernel.setArg<mm_float4>(index, cl.getInvPeriodicBoxSize());
        return;
    }
    kernel.setArg<mm_double4>(index, cl.getInvPeriodicBoxSizeDouble());
}
void OpenCLNonbondedUtilities::prepareInteractions() { void OpenCLNonbondedUtilities::prepareInteractions() {
if (!useCutoff) if (!useCutoff)
return; return;
...@@ -327,15 +347,15 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -327,15 +347,15 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
// Compute the neighbor list. // Compute the neighbor list.
findBlockBoundsKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); setPeriodicBoxSizeArg(context, findBlockBoundsKernel, 1);
findBlockBoundsKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(context, findBlockBoundsKernel, 2);
context.executeKernel(findBlockBoundsKernel, context.getNumAtoms()); context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 2);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) { if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); setPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 1);
findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 2);
context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128); context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
} }
} }
...@@ -343,8 +363,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -343,8 +363,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize()); setPeriodicBoxSizeArg(context, forceKernel, 10);
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(context, forceKernel, 11);
} }
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
...@@ -498,11 +518,11 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -498,11 +518,11 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric) if (isSymmetric)
defines["USE_SYMMETRIC"] = "1"; defines["USE_SYMMETRIC"] = "1";
defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize); defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff); defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0) if ((localDataSize/4)%2 == 0)
defines["PARAMETER_SIZE_IS_EVEN"] = "1"; defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file; string file;
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "OpenCLContext.h" #include "OpenCLContext.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "OpenCLExpressionUtilities.h" #include "OpenCLExpressionUtilities.h"
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -287,8 +288,11 @@ public: ...@@ -287,8 +288,11 @@ public:
name(name), componentType(componentType), numComponents(numComponents), size(size), memory(&memory) { name(name), componentType(componentType), numComponents(numComponents), size(size), memory(&memory) {
if (numComponents == 1) if (numComponents == 1)
type = componentType; type = componentType;
else else {
type = componentType+OpenCLExpressionUtilities::intToString(numComponents); std::stringstream s;
s << componentType << numComponents;
type = s.str();
}
} }
const std::string& getName() const { const std::string& getName() const {
return name; return name;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011 Stanford University and the Authors. * * Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -54,14 +54,14 @@ using namespace std; ...@@ -54,14 +54,14 @@ using namespace std;
class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask { class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask {
public: public:
BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel, BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, mm_float4* pinnedMemory) : context(context), cl(cl), kernel(kernel), bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) : context(context), cl(cl), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) { includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) {
} }
void execute() { void execute() {
// Copy coordinates over to this device and execute the kernel. // Copy coordinates over to this device and execute the kernel.
if (cl.getContextIndex() > 0) if (cl.getContextIndex() > 0)
cl.getQueue().enqueueWriteBuffer(cl.getPosq().getDeviceBuffer(), CL_FALSE, 0, cl.getPaddedNumAtoms()*sizeof(mm_float4), pinnedMemory); cl.getQueue().enqueueWriteBuffer(cl.getPosq().getDeviceBuffer(), CL_FALSE, 0, cl.getPaddedNumAtoms()*cl.getPosq().getElementSize(), pinnedMemory);
kernel.beginComputation(context, includeForce, includeEnergy, groups); kernel.beginComputation(context, includeForce, includeEnergy, groups);
} }
private: private:
...@@ -70,13 +70,13 @@ private: ...@@ -70,13 +70,13 @@ private:
OpenCLCalcForcesAndEnergyKernel& kernel; OpenCLCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy; bool includeForce, includeEnergy;
int groups; int groups;
mm_float4* pinnedMemory; void* pinnedMemory;
}; };
class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask { class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask {
public: public:
FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, mm_float4* pinnedMemory) : bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, void* pinnedMemory) :
context(context), cl(cl), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy), context(context), cl(cl), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory) { completionTime(completionTime), pinnedMemory(pinnedMemory) {
} }
...@@ -87,8 +87,9 @@ public: ...@@ -87,8 +87,9 @@ public:
if (includeForce) { if (includeForce) {
if (cl.getContextIndex() > 0) { if (cl.getContextIndex() > 0) {
int numAtoms = cl.getPaddedNumAtoms(); int numAtoms = cl.getPaddedNumAtoms();
void* dest = (cl.getUseDoublePrecision() ? (void*) &((mm_double4*) pinnedMemory)[(cl.getContextIndex()-1)*numAtoms] : (void*) &((mm_float4*) pinnedMemory)[(cl.getContextIndex()-1)*numAtoms]);
cl.getQueue().enqueueReadBuffer(cl.getForce().getDeviceBuffer(), CL_TRUE, 0, cl.getQueue().enqueueReadBuffer(cl.getForce().getDeviceBuffer(), CL_TRUE, 0,
numAtoms*sizeof(mm_float4), &pinnedMemory[(cl.getContextIndex()-1)*numAtoms]); numAtoms*cl.getForce().getElementSize(), dest);
} }
else else
cl.getQueue().finish(); cl.getQueue().finish();
...@@ -103,7 +104,7 @@ private: ...@@ -103,7 +104,7 @@ private:
int groups; int groups;
double& energy; double& energy;
long long& completionTime; long long& completionTime;
mm_float4* pinnedMemory; void* pinnedMemory;
}; };
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
...@@ -129,19 +130,20 @@ void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -129,19 +130,20 @@ void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
OpenCLContext& cl0 = *data.contexts[0]; OpenCLContext& cl0 = *data.contexts[0];
int elementSize = (cl0.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
if (contextForces == NULL) { if (contextForces == NULL) {
contextForces = OpenCLArray::create<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(), contextForces = OpenCLArray::create<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces"); data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces");
int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*sizeof(mm_float4); int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*elementSize;
pinnedPositionBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes); pinnedPositionBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedPositionMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes); pinnedPositionMemory = cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
pinnedForceBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes); pinnedForceBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedForceMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedForceBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes); pinnedForceMemory = cl0.getQueue().enqueueMapBuffer(*pinnedForceBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
} }
// Copy coordinates over to each device and execute the kernel. // Copy coordinates over to each device and execute the kernel.
cl0.getQueue().enqueueReadBuffer(cl0.getPosq().getDeviceBuffer(), CL_TRUE, 0, cl0.getPaddedNumAtoms()*sizeof(mm_float4), pinnedPositionMemory); cl0.getQueue().enqueueReadBuffer(cl0.getPosq().getDeviceBuffer(), CL_TRUE, 0, cl0.getPaddedNumAtoms()*elementSize, pinnedPositionMemory);
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0; data.contextEnergy[i] = 0.0;
OpenCLContext& cl = *data.contexts[i]; OpenCLContext& cl = *data.contexts[i];
...@@ -165,8 +167,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -165,8 +167,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
OpenCLContext& cl = *data.contexts[0]; OpenCLContext& cl = *data.contexts[0];
int numAtoms = cl.getPaddedNumAtoms(); int numAtoms = cl.getPaddedNumAtoms();
cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*sizeof(mm_float4), int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
numAtoms*(data.contexts.size()-1)*sizeof(mm_float4), pinnedForceMemory); cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
cl.reduceBuffer(*contextForces, data.contexts.size()); cl.reduceBuffer(*contextForces, data.contexts.size());
// Balance work between the contexts by transferring a few nonbonded tiles from the context that // Balance work between the contexts by transferring a few nonbonded tiles from the context that
......
...@@ -84,8 +84,8 @@ private: ...@@ -84,8 +84,8 @@ private:
OpenCLArray* contextForces; OpenCLArray* contextForces;
cl::Buffer* pinnedPositionBuffer; cl::Buffer* pinnedPositionBuffer;
cl::Buffer* pinnedForceBuffer; cl::Buffer* pinnedForceBuffer;
mm_float4* pinnedPositionMemory; void* pinnedPositionMemory;
mm_float4* pinnedForceMemory; void* pinnedForceMemory;
}; };
/** /**
......
...@@ -141,7 +141,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p ...@@ -141,7 +141,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
device << contexts[i]->getDeviceIndex(); device << contexts[i]->getDeviceIndex();
} }
propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str(); propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex); propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = contexts[0]->intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty; propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
contextEnergy.resize(contexts.size()); contextEnergy.resize(contexts.size());
} }
......
...@@ -162,7 +162,7 @@ public: ...@@ -162,7 +162,7 @@ public:
// Assign array elements to buckets. // Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize(); unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets); context.clearBuffer(*bucketOffset);
assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
assignElementsKernel.setArg<cl_int>(1, data.getSize()); assignElementsKernel.setArg<cl_int>(1, data.getSize());
assignElementsKernel.setArg<cl_int>(2, numBuckets); assignElementsKernel.setArg<cl_int>(2, numBuckets);
......
float4 v0 = pos2-pos1; real4 v0 = pos2-pos1;
float4 v1 = pos2-pos3; real4 v1 = pos2-pos3;
float4 cp = cross(v0, v1); real4 cp = cross(v0, v1);
float rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z; real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
rp = max(SQRT(rp), 1.0e-06f); rp = max(SQRT(rp), (real) 1.0e-06f);
float r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z; real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
float r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z; real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
float dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z; real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
float cosine = clamp(dot*RSQRT(r21*r23), -1.0f, 1.0f); real cosine = clamp(dot*RSQRT(r21*r23), (real) -1, (real) 1);
float theta = acos(cosine); real theta = acos(cosine);
COMPUTE_FORCE COMPUTE_FORCE
float4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp)); real4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
float4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp)); real4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
float4 force2 = -force1-force3; real4 force2 = -force1-force3;
float4 delta = pos2-pos1; real4 delta = pos2-pos1;
float r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z); real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
COMPUTE_FORCE COMPUTE_FORCE
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f; dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
delta.xyz *= dEdR; delta.xyz *= dEdR;
float4 force1 = delta; real4 force1 = delta;
float4 force2 = -delta; real4 force2 = -delta;
\ No newline at end of file \ No newline at end of file
const float PI = 3.14159265358979323846f; const real PI = 3.14159265358979323846f;
// Compute the first angle. // Compute the first angle.
float4 v0a = (float4) (pos1.xyz-pos2.xyz, 0.0f); real4 v0a = (real4) (pos1.xyz-pos2.xyz, 0.0f);
float4 v1a = (float4) (pos3.xyz-pos2.xyz, 0.0f); real4 v1a = (real4) (pos3.xyz-pos2.xyz, 0.0f);
float4 v2a = (float4) (pos3.xyz-pos4.xyz, 0.0f); real4 v2a = (real4) (pos3.xyz-pos4.xyz, 0.0f);
float4 cp0a = cross(v0a, v1a); real4 cp0a = cross(v0a, v1a);
float4 cp1a = cross(v1a, v2a); real4 cp1a = cross(v1a, v2a);
float cosangle = dot(normalize(cp0a), normalize(cp1a)); real cosangle = dot(normalize(cp0a), normalize(cp1a));
float angleA; real angleA;
if (cosangle > 0.99f || cosangle < -0.99f) { if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 cross_prod = cross(cp0a, cp1a); real4 cross_prod = cross(cp0a, cp1a);
float scale = dot(cp0a, cp0a)*dot(cp1a, cp1a); real scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale)); angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f) if (cosangle < 0.0f)
angleA = PI-angleA; angleA = PI-angleA;
...@@ -25,18 +25,18 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI); ...@@ -25,18 +25,18 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
// Compute the second angle. // Compute the second angle.
float4 v0b = (float4) (pos5.xyz-pos6.xyz, 0.0f); real4 v0b = (real4) (pos5.xyz-pos6.xyz, 0.0f);
float4 v1b = (float4) (pos7.xyz-pos6.xyz, 0.0f); real4 v1b = (real4) (pos7.xyz-pos6.xyz, 0.0f);
float4 v2b = (float4) (pos7.xyz-pos8.xyz, 0.0f); real4 v2b = (real4) (pos7.xyz-pos8.xyz, 0.0f);
float4 cp0b = cross(v0b, v1b); real4 cp0b = cross(v0b, v1b);
float4 cp1b = cross(v1b, v2b); real4 cp1b = cross(v1b, v2b);
cosangle = dot(normalize(cp0b), normalize(cp1b)); cosangle = dot(normalize(cp0b), normalize(cp1b));
float angleB; real angleB;
if (cosangle > 0.99f || cosangle < -0.99f) { if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 cross_prod = cross(cp0b, cp1b); real4 cross_prod = cross(cp0b, cp1b);
float scale = dot(cp0b, cp0b)*dot(cp1b, cp1b); real scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale)); angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f) if (cosangle < 0.0f)
angleB = PI-angleB; angleB = PI-angleB;
...@@ -50,7 +50,7 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI); ...@@ -50,7 +50,7 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
int2 pos = MAP_POS[MAPS[index]]; int2 pos = MAP_POS[MAPS[index]];
int size = pos.y; int size = pos.y;
float delta = 2*PI/size; real delta = 2*PI/size;
int s = (int) (angleA/delta); int s = (int) (angleA/delta);
int t = (int) (angleB/delta); int t = (int) (angleB/delta);
float4 c[4]; float4 c[4];
...@@ -59,14 +59,14 @@ c[0] = COEFF[coeffIndex]; ...@@ -59,14 +59,14 @@ c[0] = COEFF[coeffIndex];
c[1] = COEFF[coeffIndex+1]; c[1] = COEFF[coeffIndex+1];
c[2] = COEFF[coeffIndex+2]; c[2] = COEFF[coeffIndex+2];
c[3] = COEFF[coeffIndex+3]; c[3] = COEFF[coeffIndex+3];
float da = angleA/delta-s; real da = angleA/delta-s;
float db = angleB/delta-t; real db = angleB/delta-t;
// Evaluate the spline to determine the energy and gradients. // Evaluate the spline to determine the energy and gradients.
float torsionEnergy = 0.0f; real torsionEnergy = 0.0f;
float dEdA = 0.0f; real dEdA = 0.0f;
float dEdB = 0.0f; real dEdB = 0.0f;
torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x; torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w; dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y; dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
...@@ -85,17 +85,17 @@ energy += torsionEnergy; ...@@ -85,17 +85,17 @@ energy += torsionEnergy;
// Apply the force to the first torsion. // Apply the force to the first torsion.
float normCross1 = dot(cp0a, cp0a); real normCross1 = dot(cp0a, cp0a);
float normSqrBC = dot(v1a, v1a); real normSqrBC = dot(v1a, v1a);
float normBC = SQRT(normSqrBC); real normBC = SQRT(normSqrBC);
float normCross2 = dot(cp1a, cp1a); real normCross2 = dot(cp1a, cp1a);
float dp = 1.0f/normSqrBC; real dp = 1.0f/normSqrBC;
float4 ff = (float4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2); real4 ff = (real4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
float4 force1 = ff.x*cp0a; real4 force1 = ff.x*cp0a;
float4 force4 = ff.w*cp1a; real4 force4 = ff.w*cp1a;
float4 d = ff.y*force1 - ff.z*force4; real4 d = ff.y*force1 - ff.z*force4;
float4 force2 = d-force1; real4 force2 = d-force1;
float4 force3 = -d-force4; real4 force3 = -d-force4;
// Apply the force to the second torsion. // Apply the force to the second torsion.
...@@ -104,9 +104,9 @@ normSqrBC = dot(v1b, v1b); ...@@ -104,9 +104,9 @@ normSqrBC = dot(v1b, v1b);
normBC = SQRT(normSqrBC); normBC = SQRT(normSqrBC);
normCross2 = dot(cp1b, cp1b); normCross2 = dot(cp1b, cp1b);
dp = 1.0f/normSqrBC; dp = 1.0f/normSqrBC;
ff = (float4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2); ff = (real4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
float4 force5 = ff.x*cp0b; real4 force5 = ff.x*cp0b;
float4 force8 = ff.w*cp1b; real4 force8 = ff.w*cp1b;
d = ff.y*force5 - ff.z*force8; d = ff.y*force5 - ff.z*force8;
float4 force6 = d-force5; real4 force6 = d-force5;
float4 force7 = -d-force8; real4 force7 = -d-force8;
#if USE_EWALD #if USE_EWALD
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS; bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) { if (!isExcluded || needCorrection) {
float tempForce = 0.0f; real tempForce = 0;
if (r2 < CUTOFF_SQUARED || needCorrection) { if (r2 < CUTOFF_SQUARED || needCorrection) {
const float alphaR = EWALD_ALPHA*r; const real alphaR = EWALD_ALPHA*r;
const float expAlphaRSqr = EXP(-alphaR*alphaR); const real expAlphaRSqr = EXP(-alphaR*alphaR);
const float prefactor = 138.935456f*posq1.w*posq2.w*invR; const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7. // error of 3e-7.
float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR; real t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
t *= t; t *= t;
t *= t; t *= t;
t *= t; t *= t;
const float erfcAlphaR = RECIP(t*t); const real erfcAlphaR = RECIP(t*t);
if (needCorrection) { if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution. // Subtract off the part of this interaction that was included in the reciprocal space contribution.
...@@ -24,11 +24,11 @@ if (!isExcluded || needCorrection) { ...@@ -24,11 +24,11 @@ if (!isExcluded || needCorrection) {
} }
else { else {
#if HAS_LENNARD_JONES #if HAS_LENNARD_JONES
float sig = sigmaEpsilon1.x + sigmaEpsilon2.x; real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
float sig2 = invR*sig; real sig2 = invR*sig;
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2*sig2*sig2; real sig6 = sig2*sig2*sig2;
float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y); real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI); tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR; tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
#else #else
...@@ -41,32 +41,37 @@ if (!isExcluded || needCorrection) { ...@@ -41,32 +41,37 @@ if (!isExcluded || needCorrection) {
} }
#else #else
{ {
#ifdef USE_DOUBLE_PRECISION
unsigned long includeInteraction;
#else
unsigned int includeInteraction;
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED); includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (!isExcluded); includeInteraction = (!isExcluded);
#endif #endif
float tempForce = 0.0f; real tempForce = 0;
#if HAS_LENNARD_JONES #if HAS_LENNARD_JONES
float sig = sigmaEpsilon1.x + sigmaEpsilon2.x; real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
float sig2 = invR*sig; real sig2 = invR*sig;
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2*sig2*sig2; real sig6 = sig2*sig2*sig2;
float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y); real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f); tempForce = epssig6*(12.0f*sig6 - 6.0f);
tempEnergy += select(0.0f, epssig6*(sig6 - 1.0f), includeInteraction); tempEnergy += select((real) 0, epssig6*(sig6-1), includeInteraction);
#endif #endif
#if HAS_COULOMB #if HAS_COULOMB
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const float prefactor = 138.935456f*posq1.w*posq2.w; const real prefactor = 138.935456f*posq1.w*posq2.w;
tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2); tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
tempEnergy += select(0.0f, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction); tempEnergy += select((real) 0, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
#else #else
const float prefactor = 138.935456f*posq1.w*posq2.w*invR; const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
tempForce += prefactor; tempForce += prefactor;
tempEnergy += select(0.0f, prefactor, includeInteraction); tempEnergy += select((real) 0, prefactor, includeInteraction);
#endif #endif
#endif #endif
dEdR += select(0.0f, tempForce*invR*invR, includeInteraction); dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
} }
#endif #endif
\ No newline at end of file
/** /**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude. * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/ */
float4 ccb_delta(float4 vec1, float4 vec2) { real4 ccb_delta(real4 vec1, real4 vec2) {
float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f); real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
result.w = result.x*result.x + result.y*result.y + result.z*result.z; result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result; return result;
} }
...@@ -10,17 +10,17 @@ float4 ccb_delta(float4 vec1, float4 vec2) { ...@@ -10,17 +10,17 @@ float4 ccb_delta(float4 vec1, float4 vec2) {
/** /**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude. * Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/ */
float ccb_computeAngle(float4 vec1, float4 vec2) { real ccb_computeAngle(real4 vec1, real4 vec2) {
float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z; real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
float cosine = dotProduct*RSQRT(vec1.w*vec2.w); real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
float angle; real angle;
if (cosine > 0.99f || cosine < -0.99f) { if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 crossProduct = cross(vec1, vec2); real4 crossProduct = cross(vec1, vec2);
float scale = vec1.w*vec2.w; real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale)); angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f) if (cosine < 0)
angle = M_PI-angle; angle = M_PI-angle;
} }
else else
...@@ -31,8 +31,8 @@ float ccb_computeAngle(float4 vec1, float4 vec2) { ...@@ -31,8 +31,8 @@ float ccb_computeAngle(float4 vec1, float4 vec2) {
/** /**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude. * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/ */
float4 ccb_computeCross(float4 vec1, float4 vec2) { real4 ccb_computeCross(real4 vec1, real4 vec2) {
float4 result = cross(vec1, vec2); real4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z; result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result; return result;
} }
COMPUTE_FORCE COMPUTE_FORCE
float4 force1 = (float4) (-dEdX, -dEdY, -dEdZ, 0.0f); real4 force1 = (real4) (-dEdX, -dEdY, -dEdZ, 0);
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment