Commit 8d6a2a01 authored by Peter Eastman
Browse files

Beginnings of mixed/double precision support in OpenCL

parent a3d5f834
...@@ -68,11 +68,18 @@ public: ...@@ -68,11 +68,18 @@ public:
static const std::string key = "OpenCLPlatformIndex"; static const std::string key = "OpenCLPlatformIndex";
return key; return key;
} }
/**
 * Get the name of the parameter used to select what numerical precision to use.
 *
 * @return a reference to the string "OpenCLPrecision"; the same object is returned on every call
 */
static const std::string& OpenCLPrecision() {
    static const std::string parameterName = "OpenCLPrecision";
    return parameterName;
}
}; };
class OPENMM_EXPORT OpenCLPlatform::PlatformData { class OPENMM_EXPORT OpenCLPlatform::PlatformData {
public: public:
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty); PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
~PlatformData(); ~PlatformData();
void initializeContexts(const System& system); void initializeContexts(const System& system);
void syncContexts(); void syncContexts();
......
This diff is collapsed.
...@@ -62,7 +62,7 @@ struct mm_float2 { ...@@ -62,7 +62,7 @@ struct mm_float2 {
mm_float2(cl_float x, cl_float y) : x(x), y(y) { mm_float2(cl_float x, cl_float y) : x(x), y(y) {
} }
}; };
struct mm_float4 { struct mm_float4 {
cl_float x, y, z, w; cl_float x, y, z, w;
mm_float4() { mm_float4() {
} }
...@@ -87,6 +87,20 @@ struct mm_float16 { ...@@ -87,6 +87,20 @@ struct mm_float16 {
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) { s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
} }
}; };
struct mm_double2 {
cl_double x, y;
mm_double2() {
}
mm_double2(cl_double x, cl_double y) : x(x), y(y) {
}
};
struct mm_double4 {
cl_double x, y, z, w;
mm_double4() {
}
mm_double4(cl_double x, cl_double y, cl_double z, cl_double w) : x(x), y(y), z(z), w(w) {
}
};
struct mm_ushort2 { struct mm_ushort2 {
cl_ushort x, y; cl_ushort x, y;
mm_ushort2() { mm_ushort2() {
...@@ -145,7 +159,7 @@ public: ...@@ -145,7 +159,7 @@ public:
class ReorderListener; class ReorderListener;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData); OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
~OpenCLContext(); ~OpenCLContext();
/** /**
* This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
...@@ -198,6 +212,12 @@ public: ...@@ -198,6 +212,12 @@ public:
OpenCLArray& getPosq() { OpenCLArray& getPosq() {
return *posq; return *posq;
} }
/**
 * Get the array which contains a correction to the position of each atom.
 *
 * This array only exists if getUseMixedPrecision() returns true. The stored pointer is
 * dereferenced here without any check, so callers must test getUseMixedPrecision()
 * before calling this method.
 */
OpenCLArray& getPosqCorrection() {
return *posqCorrection;
}
/** /**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom. * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/ */
...@@ -405,18 +425,38 @@ public: ...@@ -405,18 +425,38 @@ public:
bool getSupportsDoublePrecision() { bool getSupportsDoublePrecision() {
return supportsDoublePrecision; return supportsDoublePrecision;
} }
/**
 * Get whether double precision is being used.
 *
 * This reports the mode in use, which is tracked separately from the flag returned
 * by getSupportsDoublePrecision().
 */
bool getUseDoublePrecision() {
return useDoublePrecision;
}
/**
 * Get whether mixed precision is being used.
 *
 * The array returned by getPosqCorrection() only exists when this returns true.
 */
bool getUseMixedPrecision() {
return useMixedPrecision;
}
/** /**
* Get the size of the periodic box. * Get the size of the periodic box.
*/ */
mm_float4 getPeriodicBoxSize() const { mm_float4 getPeriodicBoxSize() const {
return periodicBoxSize; return periodicBoxSize;
} }
/**
 * Get the size of the periodic box as double precision values.
 *
 * The x, y and z components hold the box dimensions exactly as passed to
 * setPeriodicBoxSize() (no conversion to float); the w component is 0.
 */
mm_double4 getPeriodicBoxSizeDouble() const {
return periodicBoxSizeDouble;
}
/** /**
* Set the size of the periodic box. * Set the size of the periodic box.
*/ */
void setPeriodicBoxSize(double xsize, double ysize, double zsize) { void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0); periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0); invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
periodicBoxSizeDouble = mm_double4(xsize, ysize, zsize, 0);
invPeriodicBoxSizeDouble = mm_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0);
} }
/** /**
* Get the inverse of the size of the periodic box. * Get the inverse of the size of the periodic box.
...@@ -424,6 +464,12 @@ public: ...@@ -424,6 +464,12 @@ public:
mm_float4 getInvPeriodicBoxSize() const { mm_float4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize; return invPeriodicBoxSize;
} }
/**
 * Get the inverse of the size of the periodic box as double precision values.
 *
 * The x, y and z components hold 1.0/xsize, 1.0/ysize and 1.0/zsize as computed by
 * setPeriodicBoxSize() (no conversion to float); the w component is 0.
 */
mm_double4 getInvPeriodicBoxSizeDouble() const {
return invPeriodicBoxSizeDouble;
}
/** /**
* Get the OpenCLIntegrationUtilities for this context. * Get the OpenCLIntegrationUtilities for this context.
*/ */
...@@ -502,6 +548,11 @@ private: ...@@ -502,6 +548,11 @@ private:
* of molecules and resort the atoms. * of molecules and resort the atoms.
*/ */
void validateMolecules(); void validateMolecules();
/**
 * This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
 *
 * NOTE(review): Real/Real4 and Mixed/Mixed4 presumably name the scalar and four-component
 * types (e.g. cl_float/mm_float4 or cl_double/mm_double4) used for single-precision and
 * mixed-precision quantities respectively -- confirm against the definition, which is
 * not visible here.
 *
 * @param enforcePeriodic    NOTE(review): presumably selects whether atoms are wrapped into
 *                           the periodic box while reordering -- confirm against reorderAtoms()
 */
template <class Real, class Real4, class Mixed, class Mixed4>
void reorderAtomsImpl(bool enforcePeriodic);
const System& system; const System& system;
double time; double time;
OpenCLPlatform::PlatformData& platformData; OpenCLPlatform::PlatformData& platformData;
...@@ -515,9 +566,9 @@ private: ...@@ -515,9 +566,9 @@ private:
int numThreadBlocks; int numThreadBlocks;
int numForceBuffers; int numForceBuffers;
int simdWidth; int simdWidth;
bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered, moleculesInvalid; bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, moleculesInvalid;
mm_float4 periodicBoxSize; mm_float4 periodicBoxSize, invPeriodicBoxSize;
mm_float4 invPeriodicBoxSize; mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble;
std::string defaultOptimizationOptions; std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
cl::Context context; cl::Context context;
...@@ -538,6 +589,7 @@ private: ...@@ -538,6 +589,7 @@ private:
cl::Buffer* pinnedBuffer; cl::Buffer* pinnedBuffer;
void* pinnedMemory; void* pinnedMemory;
OpenCLArray* posq; OpenCLArray* posq;
OpenCLArray* posqCorrection;
OpenCLArray* velm; OpenCLArray* velm;
OpenCLArray* force; OpenCLArray* force;
OpenCLArray* forceBuffers; OpenCLArray* forceBuffers;
......
...@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in ...@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in
} }
}; };
/**
 * Set the kernel argument that receives the position correction array.
 *
 * When the context uses mixed precision, the device buffer of the posq correction
 * array is bound to the argument. Otherwise the correction array does not exist, so
 * a NULL pointer is passed in its place.
 *
 * @param cl      the context whose precision mode and correction array are consulted
 * @param kernel  the kernel whose argument is to be set
 * @param index   the index of the kernel argument to set
 */
static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (!cl.getUseMixedPrecision()) {
        // No correction array exists outside mixed precision mode.
        kernel.setArg<void*>(index, NULL);
        return;
    }
    kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
}
OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context), OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL), posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL), random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
...@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays. // Create workspace arrays.
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta"); if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0, 0.0, 0.0, 0.0)); posDelta = OpenCLArray::create<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
posDelta->upload(deltas); vector<mm_double4> deltas(posDelta->getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize"); posDelta->upload(deltas);
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f)); stepSize = OpenCLArray::create<mm_double2>(context, 1, "stepSize");
stepSize->upload(step); vector<mm_double2> step(1, mm_double2(0.0, 0.0));
stepSize->upload(step);
}
else {
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
stepSize->upload(step);
}
// Create kernels for enforcing constraints. // Create kernels for enforcing constraints.
...@@ -458,51 +475,86 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -458,51 +475,86 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Record the CCMA data structures. // Record the CCMA data structures.
ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms"); ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints"); ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex"); ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1"); ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged"); ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int)); ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int)); ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_int2> atomsVec(ccmaAtoms->getSize()); vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize()); vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize()); vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize()); vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize()); if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
for (int i = 0; i < numCCMA; i++) { ccmaDistance = OpenCLArray::create<mm_double4>(context, numCCMA, "CcmaDistance");
int index = constraintOrder[i]; ccmaDelta1 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta1");
int c = ccmaConstraints[index]; ccmaDelta2 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta2");
atomsVec[i].x = atom1[c]; ccmaReducedMass = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaReducedMass");
atomsVec[i].y = atom2[c]; ccmaConstraintMatrixValue = OpenCLArray::create<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
distanceVec[i].w = (float) distance[c]; vector<mm_double4> distanceVec(ccmaDistance->getSize());
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c]))); vector<cl_double> reducedMassVec(ccmaReducedMass->getSize());
for (unsigned int j = 0; j < matrix[index].size(); j++) { vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first; for (int i = 0; i < numCCMA; i++) {
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second; int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = distance[c];
reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
} }
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA; ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
} }
for (unsigned int i = 0; i < atomConstraints.size(); i++) { else {
numAtomConstraintsVec[i] = atomConstraints[i].size(); ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) { ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i); ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1); ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = (float) distance[c];
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
} }
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
} }
ccmaAtoms->upload(atomsVec); ccmaAtoms->upload(atomsVec);
ccmaDistance->upload(distanceVec);
ccmaAtomConstraints->upload(atomConstraintsVec); ccmaAtomConstraints->upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec); ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec); ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
// Create the CCMA kernels. // Create the CCMA kernels.
...@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines); cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites"); vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer()); setPosqCorrectionArg(context, vsitePositionKernel, 1);
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces"); vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
// Skip argument 1: the force array hasn't been created yet. setPosqCorrectionArg(context, vsiteForceKernel, 1);
vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer()); // Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
numVsites = num2Avg+num3Avg+numOutOfPlane; numVsites = num2Avg+num3Avg+numOutOfPlane;
} }
...@@ -686,23 +740,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub ...@@ -686,23 +740,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
if (!hasInitialized) { if (!hasInitialized) {
settleKernel.setArg<cl_int>(0, settleAtoms->getSize()); settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer()); if (context.getUseMixedPrecision())
settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer()); settleKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer()); else
settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer()); settleKernel.setArg<void*>(3, NULL);
settleKernel.setArg<cl::Buffer>(4, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
} }
settleKernel.setArg<cl_float>(1, (cl_float) tol); if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel.setArg<cl_double>(1, (cl_double) tol);
else
settleKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(settleKernel, settleAtoms->getSize()); context.executeKernel(settleKernel, settleAtoms->getSize());
} }
if (shakeAtoms != NULL) { if (shakeAtoms != NULL) {
if (!hasInitialized) { if (!hasInitialized) {
shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize()); shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer()); if (context.getUseMixedPrecision())
shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer()); shakeKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer()); else
shakeKernel.setArg<void*>(3, NULL);
shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(6, shakeParams->getDeviceBuffer());
} }
shakeKernel.setArg<cl_float>(1, (cl_float) tol); if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel.setArg<cl_double>(1, (cl_double) tol);
else
shakeKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(shakeKernel, shakeAtoms->getSize()); context.executeKernel(shakeKernel, shakeAtoms->getSize());
} }
if (ccmaAtoms != NULL) { if (ccmaAtoms != NULL) {
...@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub ...@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
ccmaDirectionsKernel.setArg<void*>(3, NULL);
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
...@@ -730,7 +802,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub ...@@ -730,7 +802,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
} }
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol); if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
else
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
const int checkInterval = 4; const int checkInterval = 4;
cl::Event event; cl::Event event;
...@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() { ...@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() { void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) { if (numVsites > 0) {
vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
context.executeKernel(vsiteForceKernel, numVsites); context.executeKernel(vsiteForceKernel, numVsites);
} }
} }
......
This diff is collapsed.
...@@ -1145,7 +1145,10 @@ private: ...@@ -1145,7 +1145,10 @@ private:
OpenCLArray* uniformRandoms; OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed; OpenCLArray* randomSeed;
OpenCLParameterSet* perDofValues; OpenCLParameterSet* perDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValues; mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
std::vector<float> contextValuesFloat;
std::vector<double> contextValuesDouble;
std::vector<float> contextValues; std::vector<float> contextValues;
std::vector<std::vector<cl::Kernel> > kernels; std::vector<std::vector<cl::Kernel> > kernels;
cl::Kernel sumEnergyKernel, randomKernel; cl::Kernel sumEnergyKernel, randomKernel;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -32,32 +32,34 @@ ...@@ -32,32 +32,34 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) : OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
context(context), numParameters(numParameters), numObjects(numObjects), name(name) { context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
int params = numParameters; int params = numParameters;
int bufferCount = 0; int bufferCount = 0;
elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
string elementType = (useDoublePrecision ? "double" : "float");
try { try {
if (!bufferPerParameter) { if (!bufferPerParameter) {
while (params > 2) { while (params > 2) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*4);
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, *buf));
params -= 4; params -= 4;
} }
if (params > 1) { if (params > 1) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*2);
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, *buf));
params -= 2; params -= 2;
} }
} }
while (params > 0) { while (params > 0) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize);
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, *buf));
params--; params--;
} }
} }
...@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() { ...@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() {
delete &buffers[i].getMemory(); delete &buffers[i].getMemory();
} }
void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) const { template <class T>
void OpenCLParameterSet::getParameterValues(vector<vector<T> >& values) const {
if (sizeof(T) != elementSize)
throw OpenMMException("Called getParameterValues() with vector of wrong type");
values.resize(numObjects); values.resize(numObjects);
for (int i = 0; i < numObjects; i++) for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters); values[i].resize(numParameters);
try { try {
int base = 0; int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) { for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") { if (buffers[i].getSize() == 4*elementSize) {
vector<mm_float4> data(numObjects); vector<T> data(4*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x; values[j][base] = data[4*j];
if (base+1 < numParameters) if (base+1 < numParameters)
values[j][base+1] = data[j].y; values[j][base+1] = data[4*j+1];
if (base+2 < numParameters) if (base+2 < numParameters)
values[j][base+2] = data[j].z; values[j][base+2] = data[4*j+2];
if (base+3 < numParameters) if (base+3 < numParameters)
values[j][base+3] = data[j].w; values[j][base+3] = data[4*j+3];
} }
base += 4; base += 4;
} }
else if (buffers[i].getType() == "float2") { else if (buffers[i].getSize() == 2*elementSize) {
vector<mm_float2> data(numObjects); vector<T> data(2*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x; values[j][base] = data[2*j];
if (base+1 < numParameters) if (base+1 < numParameters)
values[j][base+1] = data[j].y; values[j][base+1] = data[2*j+1];
} }
base += 2; base += 2;
} }
else if (buffers[i].getType() == "float") { else if (buffers[i].getSize() == elementSize) {
vector<cl_float> data(numObjects); vector<T> data(numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) for (int j = 0; j < numObjects; j++)
values[j][base] = data[j]; values[j][base] = data[j];
...@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c ...@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c
} }
} }
void OpenCLParameterSet::setParameterValues(const vector<vector<cl_float> >& values) { template <class T>
void OpenCLParameterSet::setParameterValues(const vector<vector<T> >& values) {
if (sizeof(T) != elementSize)
throw OpenMMException("Called setParameterValues() with vector of wrong type");
try { try {
int base = 0; int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) { for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") { if (buffers[i].getSize() == 4*elementSize) {
vector<mm_float4> data(numObjects); vector<T> data(4*numObjects);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base]; data[4*j] = values[j][base];
if (base+1 < numParameters) if (base+1 < numParameters)
data[j].y = values[j][base+1]; data[4*j+1] = values[j][base+1];
if (base+2 < numParameters) if (base+2 < numParameters)
data[j].z = values[j][base+2]; data[4*j+2] = values[j][base+2];
if (base+3 < numParameters) if (base+3 < numParameters)
data[j].w = values[j][base+3]; data[4*j+3] = values[j][base+3];
} }
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 4; base += 4;
} }
else if (buffers[i].getType() == "float2") { else if (buffers[i].getSize() == 2*elementSize) {
vector<mm_float2> data(numObjects); vector<T> data(2*numObjects);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base]; data[2*j] = values[j][base];
if (base+1 < numParameters) if (base+1 < numParameters)
data[j].y = values[j][base+1]; data[2*j+1] = values[j][base+1];
} }
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 2; base += 2;
} }
else if (buffers[i].getType() == "float") { else if (buffers[i].getSize() == elementSize) {
vector<cl_float> data(numObjects); vector<T> data(numObjects);
for (int j = 0; j < numObjects; j++) for (int j = 0; j < numObjects; j++)
data[j] = values[j][base]; data[j] = values[j][base];
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
...@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr ...@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr
const string suffixes[] = {".x", ".y", ".z", ".w"}; const string suffixes[] = {".x", ".y", ".z", ".w"};
int buffer = -1; int buffer = -1;
for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) { for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
if (index*sizeof(cl_float) < buffers[i].getSize()) if (index*elementSize < buffers[i].getSize())
buffer = i; buffer = i;
else else
index -= buffers[i].getSize()/sizeof(cl_float); index -= buffers[i].getSize()/elementSize;
} }
if (buffer == -1) if (buffer == -1)
throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")"); throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")");
stringstream suffix; stringstream suffix;
suffix << (buffer+1) << extraSuffix; suffix << (buffer+1) << extraSuffix;
if (buffers[buffer].getType() != "float") if (buffers[buffer].getSize() != elementSize)
suffix << suffixes[index]; suffix << suffixes[index];
return suffix.str(); return suffix.str();
} }
/**
 * Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
 *
 * Only these two element types are instantiated here. Both member templates also check at
 * run time that sizeof(T) equals the element size chosen for the parameter set and throw an
 * OpenMMException otherwise, so the caller must pick the instantiation that matches the
 * precision the set was created with.
 *
 * NOTE(review): the explicit instantiations are presumably required because the template
 * bodies live in this source file rather than in the header -- confirm before removing.
 */
namespace OpenMM {
template void OpenCLParameterSet::getParameterValues<float>(vector<vector<float> >& values) const;
template void OpenCLParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
template void OpenCLParameterSet::getParameterValues<double>(vector<vector<double> >& values) const;
template void OpenCLParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
}
\ No newline at end of file
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -51,8 +51,9 @@ public: ...@@ -51,8 +51,9 @@ public:
* @param name the name of the parameter set * @param name the name of the parameter set
* @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false, * @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false,
* multiple parameters may be combined into a single buffer. * multiple parameters may be combined into a single buffer.
* @param useDoublePrecision whether values should be stored as single or double precision
*/ */
OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false); OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
~OpenCLParameterSet(); ~OpenCLParameterSet();
/** /**
* Get the number of parameters. * Get the number of parameters.
...@@ -71,13 +72,15 @@ public: ...@@ -71,13 +72,15 @@ public:
* *
* @param values on exit, values[i][j] contains the value of parameter j for object i * @param values on exit, values[i][j] contains the value of parameter j for object i
*/ */
void getParameterValues(std::vector<std::vector<cl_float> >& values) const; template <class T>
void getParameterValues(std::vector<std::vector<T> >& values) const;
/** /**
* Set the values of all parameters. * Set the values of all parameters.
* *
* @param values values[i][j] contains the value of parameter j for object i * @param values values[i][j] contains the value of parameter j for object i
*/ */
void setParameterValues(const std::vector<std::vector<cl_float> >& values); template <class T>
void setParameterValues(const std::vector<std::vector<T> >& values);
/** /**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers * Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data. * containing the data.
...@@ -95,8 +98,7 @@ public: ...@@ -95,8 +98,7 @@ public:
std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const; std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
private: private:
OpenCLContext& context; OpenCLContext& context;
int numParameters; int numParameters, numObjects, elementSize;
int numObjects;
std::string name; std::string name;
std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers; std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers;
}; };
......
...@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() { ...@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() {
registerKernelFactory(RemoveCMMotionKernel::Name(), factory); registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(OpenCLDeviceIndex()); platformProperties.push_back(OpenCLDeviceIndex());
platformProperties.push_back(OpenCLPlatformIndex()); platformProperties.push_back(OpenCLPlatformIndex());
platformProperties.push_back(OpenCLPrecision());
setPropertyDefaultValue(OpenCLDeviceIndex(), ""); setPropertyDefaultValue(OpenCLDeviceIndex(), "");
setPropertyDefaultValue(OpenCLPlatformIndex(), ""); setPropertyDefaultValue(OpenCLPlatformIndex(), "");
setPropertyDefaultValue(OpenCLPrecision(), "single");
} }
bool OpenCLPlatform::supportsDoublePrecision() const { bool OpenCLPlatform::supportsDoublePrecision() const {
...@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri ...@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second); getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second);
const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ? const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second); getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue)); string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
} }
void OpenCLPlatform::contextDestroyed(ContextImpl& context) const { void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
...@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
delete data; delete data;
} }
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) { OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
int platformIndex = 0; int platformIndex = 0;
if (platformPropValue.length() > 0) if (platformPropValue.length() > 0)
stringstream(platformPropValue) >> platformIndex; stringstream(platformPropValue) >> platformIndex;
...@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p ...@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
if (devices[i].length() > 0) { if (devices[i].length() > 0) {
unsigned int deviceIndex; unsigned int deviceIndex;
stringstream(devices[i]) >> deviceIndex; stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, *this)); contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, precisionProperty, *this));
} }
} }
if (contexts.size() == 0) if (contexts.size() == 0)
contexts.push_back(new OpenCLContext(system, platformIndex, -1, *this)); contexts.push_back(new OpenCLContext(system, platformIndex, -1, precisionProperty, *this));
stringstream device; stringstream device;
for (int i = 0; i < (int) contexts.size(); i++) { for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0) if (i > 0)
...@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p ...@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
} }
propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str(); propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex); propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
contextEnergy.resize(contexts.size()); contextEnergy.resize(contexts.size());
} }
......
...@@ -2,17 +2,19 @@ ...@@ -2,17 +2,19 @@
* Apply the Andersen thermostat to adjust particle velocities. * Apply the Andersen thermostat to adjust particle velocities.
*/ */
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global mixed4* velm, __global const mixed2* restrict stepSize,
        __global const float4* restrict random, unsigned int randomIndex, __global const int* restrict atomGroups) {
    // Both quantities depend only on the kernel arguments, so each work-item computes them once.
    const float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
    const float randomRange = erf(collisionProbability/sqrt(2.0f));
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        // The selector is looked up through atomGroups, so atoms that map to the same
        // entry see the same random value and therefore make the same decision.
        const float4 groupRandom = random[randomIndex+atomGroups[atom]];
        const float4 atomRandom = random[randomIndex+atom];
        mixed4 vel = velm[atom];

        // keep == 1 leaves the velocity untouched; keep == 0 replaces it with a random draw.
        real keep;
        if (groupRandom.w > -randomRange && groupRandom.w < randomRange)
            keep = 0;
        else
            keep = 1;

        // vel.w is used as the scale for the new velocity (together with kT).
        const real kick = (1-keep)*sqrt(kT*vel.w);
        vel.x = keep*vel.x + kick*atomRandom.x;
        vel.y = keep*vel.y + kick*atomRandom.y;
        vel.z = keep*vel.z + kick*atomRandom.z;
        velm[atom] = vel;
    }
}
...@@ -2,13 +2,16 @@ ...@@ -2,13 +2,16 @@
* Perform the first step of Brownian integration. * Perform the first step of Brownian integration.
*/ */
__kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __global const real4* restrict force,
        __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
    // Each work-item starts at its own offset into the random buffer and advances it in
    // lock step with the atom index, whether or not the atom is actually moved.
    randomIndex += get_global_id(0);
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0), randomIndex += get_global_size(0)) {
        const mixed invMass = velm[atom].w;
        if (invMass == 0)
            continue; // posDelta is left as it was for atoms whose velm.w is zero.

        // Per-atom factors shared by the three components.
        const mixed forceScale = tauDeltaT*invMass;
        const mixed randomScale = noiseAmplitude*sqrt(invMass);
        const real4 atomForce = force[atom];
        const float4 atomRandom = random[randomIndex];
        posDelta[atom] = (mixed4) (forceScale*atomForce.x + randomScale*atomRandom.x,
                                   forceScale*atomForce.y + randomScale*atomRandom.y,
                                   forceScale*atomForce.z + randomScale*atomRandom.z, 0);
    }
}
...@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl ...@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
* Perform the second step of Brownian integration. * Perform the second step of Brownian integration.
*/ */
__kernel void integrateBrownianPart2(mixed oneOverDeltaT, __global real4* posq, __global real4* posqCorrection, __global mixed4* velm, __global const mixed4* restrict posDelta) {
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        if (velm[atom].w == 0)
            continue; // Atoms whose velm.w is zero are neither moved nor given a velocity.

        const mixed4 displacement = posDelta[atom];

        // The velocity is the displacement scaled by oneOverDeltaT; velm.w is not modified.
        velm[atom].x = oneOverDeltaT*displacement.x;
        velm[atom].y = oneOverDeltaT*displacement.y;
        velm[atom].z = oneOverDeltaT*displacement.z;

#ifdef USE_MIXED_PRECISION
        // Rebuild the position from the value in posq plus the stored correction term.
        const real4 stored = posq[atom];
        const real4 correction = posqCorrection[atom];
        mixed4 updated = (mixed4) (stored.x+(mixed)correction.x, stored.y+(mixed)correction.y, stored.z+(mixed)correction.z, stored.w);
#else
        real4 updated = posq[atom];
#endif
        updated.x += displacement.x;
        updated.y += displacement.y;
        updated.z += displacement.z;
#ifdef USE_MIXED_PRECISION
        // Split the result again: the value cast to real goes to posq, what the cast drops goes to posqCorrection.
        posq[atom] = (real4) ((real) updated.x, (real) updated.y, (real) updated.z, (real) updated.w);
        posqCorrection[atom] = (real4) (updated.x-(real) updated.x, updated.y-(real) updated.y, updated.z-(real) updated.z, 0);
#else
        posq[atom] = updated;
#endif
    }
}
// Return the position stored for one particle. With USE_MIXED_PRECISION the x, y and z
// components are the sum of the posq entry and the matching posqCorrection entry, evaluated
// in the mixed type; w is taken from posq alone. Otherwise the posq entry is returned as is.
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
    const real4 stored = posq[index];
    const real4 correction = posqCorrection[index];
    mixed4 combined;
    combined.x = stored.x+(mixed)correction.x;
    combined.y = stored.y+(mixed)correction.y;
    combined.z = stored.z+(mixed)correction.z;
    combined.w = stored.w;
    return combined;
#else
    return posq[index];
#endif
}
/** /**
* Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation. * Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/ */
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global float4* restrict constraintDistance, __global const float4* restrict atomPositions) { __kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance, __global const real4* restrict atomPositions, __global const real4* restrict posCorrection) {
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the direction for this constraint. // Compute the direction for this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index]; mixed4 dir = constraintDistance[index];
float4 oldPos1 = atomPositions[atoms.x]; mixed4 oldPos1 = loadPos(atomPositions, posCorrection, atoms.x);
float4 oldPos2 = atomPositions[atoms.y]; mixed4 oldPos2 = loadPos(atomPositions, posCorrection, atoms.y);
dir.x = oldPos1.x-oldPos2.x; dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y; dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
...@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai ...@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
/** /**
* Compute the force applied by each constraint. * Compute the force applied by each constraint.
*/ */
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const float4* restrict constraintDistance, __global const float4* restrict atomPositions, __kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
__global const float* restrict reducedMass, __global float* restrict delta1, __global int* restrict converged, float tol, int iteration) { __global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
__local int groupConverged; __local int groupConverged;
if (converged[1-iteration%2]) { if (converged[1-iteration%2]) {
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
...@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto ...@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
groupConverged = 1; groupConverged = 1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
float lowerTol = 1.0f-2.0f*tol+tol*tol; mixed lowerTol = 1-2*tol+tol*tol;
float upperTol = 1.0f+2.0f*tol+tol*tol; mixed upperTol = 1+2*tol+tol*tol;
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index]; mixed4 dir = constraintDistance[index];
float4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y]; mixed4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
#ifndef CONSTRAIN_VELOCITIES #ifndef CONSTRAIN_VELOCITIES
rp_ij.xyz += dir.xyz; rp_ij.xyz += dir.xyz;
#endif #endif
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z; mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z; mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
#ifdef CONSTRAIN_VELOCITIES #ifdef CONSTRAIN_VELOCITIES
delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2; delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;
// See whether it has converged. // See whether it has converged.
...@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto ...@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
converged[iteration%2] = 0; converged[iteration%2] = 0;
} }
#else #else
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z; mixed rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w; mixed dist2 = dir.w*dir.w;
float diff = dist2 - rp2; mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f); delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
// See whether it has converged. // See whether it has converged.
...@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto ...@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
/** /**
* Multiply the vector of constraint forces by the constraint matrix. * Multiply the vector of constraint forces by the constraint matrix.
*/ */
__kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, __global float* restrict delta2, __global const int* restrict constraintMatrixColumn, __kernel void multiplyByConstraintMatrix(__global const mixed* restrict delta1, __global mixed* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const float* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) { __global const mixed* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
if (converged[iteration%2]) if (converged[iteration%2])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix. // Multiply by the inverse constraint matrix.
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
float sum = 0.0f; mixed sum = 0;
for (int i = 0; ; i++) { for (int i = 0; ; i++) {
int element = index+i*NUM_CONSTRAINTS; int element = index+i*NUM_CONSTRAINTS;
int column = constraintMatrixColumn[element]; int column = constraintMatrixColumn[element];
...@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, ...@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1,
/** /**
* Update the atom positions based on constraint forces. * Update the atom positions based on constraint forces.
*/ */
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const float4* restrict constraintDistance, __kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const mixed4* restrict constraintDistance,
__global float4* restrict atomPositions, __global const float4* restrict velm, __global const float* restrict delta1, __global const float* restrict delta2, __global int* restrict converged, int iteration) { __global mixed4* restrict atomPositions, __global const mixed4* restrict velm, __global const mixed* restrict delta1, __global const mixed* restrict delta2, __global int* restrict converged, int iteration) {
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
converged[1-iteration%2] = 1; converged[1-iteration%2] = 1;
if (converged[iteration%2]) if (converged[iteration%2])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f); mixed damping = (iteration < 2 ? 0.5f : 1.0f);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
// Compute the new position of this atom. // Compute the new position of this atom.
float4 atomPos = atomPositions[index]; mixed4 atomPos = atomPositions[index];
float invMass = velm[index].w; mixed invMass = velm[index].w;
int num = numAtomConstraints[index]; int num = numAtomConstraints[index];
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
int constraint = atomConstraints[index+i*NUM_ATOMS]; int constraint = atomConstraints[index+i*NUM_ATOMS];
bool forward = (constraint > 0); bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1); constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*delta2[constraint]; mixed constraintForce = damping*invMass*delta2[constraint];
constraintForce = (forward ? constraintForce : -constraintForce); constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = constraintDistance[constraint]; mixed4 dir = constraintDistance[constraint];
atomPos.x += constraintForce*dir.x; atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y; atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z; atomPos.z += constraintForce*dir.z;
......
// Add the per-atom displacement in posDelta to the stored positions. posDelta itself is
// only read by this kernel.
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
    for (unsigned int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        const mixed4 displacement = posDelta[atom];
#ifdef USE_MIXED_PRECISION
        // Rebuild the position from the value in posq plus the stored correction term.
        const real4 stored = posq[atom];
        const real4 correction = posqCorrection[atom];
        mixed4 updated = (mixed4) (stored.x+(mixed)correction.x, stored.y+(mixed)correction.y, stored.z+(mixed)correction.z, stored.w);
#else
        mixed4 updated = posq[atom];
#endif
        updated.x += displacement.x;
        updated.y += displacement.y;
        updated.z += displacement.z;
#ifdef USE_MIXED_PRECISION
        // The value cast to real goes to posq; what the cast drops goes to posqCorrection.
        posq[atom] = (real4) ((real) updated.x, (real) updated.y, (real) updated.z, (real) updated.w);
        posqCorrection[atom] = (real4) (updated.x-(real) updated.x, updated.y-(real) updated.y, updated.z-(real) updated.z, 0);
#else
        posq[atom] = updated;
#endif
    }
}
__kernel void computeSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) { __kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
__local float tempBuffer[WORK_GROUP_SIZE]; __local float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0); const unsigned int thread = get_local_id(0);
float sum = 0.0f; float sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index]; sum += sumBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
...@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa ...@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa
result[outputIndex] = tempBuffer[0]; result[outputIndex] = tempBuffer[0];
} }
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) { #ifdef SUPPORTS_DOUBLE_PRECISION
// Sum the first bufferSize entries of sumBuffer in double precision and store the total in
// result[outputIndex].
// NOTE(review): only local ids are used, so this looks intended to run as a single
// work-group of WORK_GROUP_SIZE work-items -- confirm against the host-side launch.
__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, unsigned int outputIndex, int bufferSize) {
    __local double partialSums[WORK_GROUP_SIZE];
    const unsigned int lane = get_local_id(0);

    // Phase 1: every work-item accumulates the entries lane, lane+localSize, lane+2*localSize, ...
    double accumulated = 0;
    unsigned int position = lane;
    while (position < bufferSize) {
        accumulated += sumBuffer[position];
        position += get_local_size(0);
    }
    partialSums[lane] = accumulated;

    // Phase 2: pairwise reduction in local memory. The barrier sits at the top of each pass
    // so that the writes of the previous pass (or of phase 1) are visible before being read.
    for (int stride = 1; stride < WORK_GROUP_SIZE; stride *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const bool receives = (lane%(stride*2) == 0);
        if (receives && lane+stride < WORK_GROUP_SIZE)
            partialSums[lane] += partialSums[lane+stride];
    }

    // Work-item 0 performed the last addition into slot 0 itself, so no further barrier is needed.
    if (lane == 0)
        result[outputIndex] = partialSums[0];
}
#endif
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index]; #ifdef USE_MIXED_PRECISION
position.xyz += posDelta[index].xyz; real4 pos1 = posq[index];
posq[index] = position; real4 pos2 = posqCorrection[index];
posDelta[index] = (float4) 0.0f; mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
posDelta[index] = (mixed4) 0;
} }
} }
......
__kernel void computeGlobal(__global float2* restrict dt, __global float* restrict globals, __global float* restrict params, __kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
float uniform, float gaussian, __global const float* restrict energy) { float uniform, float gaussian, __global const real* restrict energy) {
COMPUTE_STEP COMPUTE_STEP
} }
#ifdef SUPPORTS_DOUBLE_PRECISION /**
#pragma OPENCL EXTENSION cl_khr_fp64 : enable * Load the position of a particle.
*/
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifndef USE_MIXED_PRECISION
    // No correction term in this mode: the stored value is returned directly and
    // posqCorrection is never read.
    return posq[index];
#else
    // x, y and z are the sum of the stored value and its correction, evaluated in
    // "mixed" arithmetic. w is taken from posq alone; the correction's w is ignored.
    const real4 coarse = posq[index];
    const real4 fine = posqCorrection[index];
    mixed4 combined;
    combined.x = coarse.x+(mixed) fine.x;
    combined.y = coarse.y+(mixed) fine.y;
    combined.z = coarse.z+(mixed) fine.z;
    combined.w = coarse.w;
    return combined;
#endif
}
/**
 * Store the position of a particle.
 *
 * With USE_MIXED_PRECISION defined, every component of pos is cast to "real" and written
 * to posq[index]; the part of x, y and z that the cast dropped (pos minus its "real"
 * value) is written to posqCorrection[index], whose w component is set to 0.
 * Without USE_MIXED_PRECISION, pos is written to posq[index] and posqCorrection is not
 * touched.
 *
 * @param posq            destination array for the "real" position values
 * @param posqCorrection  destination array for the correction terms (mixed precision only)
 * @param index           element of both arrays to write
 * @param pos             the value to store
 */
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
// Rounded copy of all four components, w included.
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
// What the rounding above lost for x, y and z; no correction is kept for w.
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif #endif
}
__kernel void computePerDof(__global float4* restrict posq, __global float4* restrict posDelta, __global float4* restrict velm, __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global const float4* restrict force, __global const float2* restrict dt, __global const float* restrict globals, __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global const float* restrict params, __global float* restrict sum, __global const float4* restrict gaussianValues, __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
unsigned int randomIndex, __global const float4* restrict uniformValues, __global const float* restrict energy unsigned int randomIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
float stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = get_global_id(0); int index = get_global_id(0);
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
#ifdef SUPPORTS_DOUBLE_PRECISION
#ifdef LOAD_POS_AS_DELTA #ifdef LOAD_POS_AS_DELTA
double4 position = convert_double4(posq[index]+posDelta[index]); mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
#else #else
double4 position = convert_double4(posq[index]); mixed4 position = loadPos(posq, posqCorrection, index);
#endif
double4 velocity = convert_double4(velm[index]);
double4 f = convert_double4(force[index]);
double mass = 1.0/velocity.w;
#else
#ifdef LOAD_POS_AS_DELTA
float4 position = posq[index]+posDelta[index];
#else
float4 position = posq[index];
#endif
float4 velocity = velm[index];
float4 f = force[index];
float mass = 1.0f/velocity.w;
#endif #endif
mixed4 velocity = velm[index];
real4 f = force[index];
mixed mass = 1/velocity.w;
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
float4 gaussian = gaussianValues[randomIndex]; float4 gaussian = gaussianValues[randomIndex];
float4 uniform = uniformValues[index]; float4 uniform = uniformValues[index];
......
float2 multofFloat2(float2 a, float2 b) { float2 multofFloat2(float2 a, float2 b) {
return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
} }
......
#ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
enum {VelScale, ForceScale, NoiseScale, MaxParams}; enum {VelScale, ForceScale, NoiseScale, MaxParams};
/** /**
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta, __kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
__global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) { __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
float vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
float fscale = paramBuffer[ForceScale]; mixed fscale = paramBuffer[ForceScale];
float noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
float stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = get_global_id(0); int index = get_global_id(0);
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
float sqrtInvMass = sqrt(velocity.w); mixed sqrtInvMass = sqrt(velocity.w);
velocity.xyz = vscale*velocity.xyz + fscale*velocity.w*force[index].xyz + noisescale*sqrtInvMass*random[randomIndex].xyz; velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity; velm[index] = velocity;
posDelta[index] = stepSize*velocity; posDelta[index] = stepSize*velocity;
} }
...@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co ...@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) { __kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
#ifdef SUPPORTS_DOUBLE_PRECISION #ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
...@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co ...@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
#endif #endif
int index = get_global_id(0); int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 vel = velm[index]; mixed4 vel = velm[index];
if (vel.w != 0.0) { if (vel.w != 0.0) {
float4 pos = posq[index]; #ifdef USE_MIXED_PRECISION
float4 delta = posDelta[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz; pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION #ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz; vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else #else
vel.xyz = invStepSize*delta.xyz; vel.xyz = invStepSize*delta.xyz;
#endif #endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos; posq[index] = pos;
#endif
velm[index] = vel; velm[index] = vel;
} }
index += get_global_size(0); index += get_global_size(0);
...@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co ...@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt, __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, __global mixed2* restrict dt,
__global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) { __global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
// Calculate the error. // Calculate the error.
float err = 0.0f; mixed err = 0.0f;
unsigned int index = get_local_id(0); unsigned int index = get_local_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 f = force[index]; real4 f = force[index];
float invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += get_global_size(0); index += get_global_size(0);
} }
...@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta ...@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
if (get_global_id(0) == 0) { if (get_global_id(0) == 0) {
// Select the new step size. // Select the new step size.
float totalError = sqrt(error[0]/(NUM_ATOMS*3)); mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
float newStepSize = sqrt(errorTol/totalError); mixed newStepSize = sqrt(errorTol/totalError);
float oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase. newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize) if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
...@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta ...@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
// Recalculate the integration parameters. // Recalculate the integration parameters.
float vscale = exp(-newStepSize/tau); mixed vscale = exp(-newStepSize/tau);
float fscale = (1-vscale)*tau; mixed fscale = (1-vscale)*tau;
float noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau); mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale; params[VelScale] = vscale;
params[ForceScale] = fscale; params[ForceScale] = fscale;
params[NoiseScale] = noisescale; params[NoiseScale] = noisescale;
......
...@@ -2,13 +2,16 @@ ...@@ -2,13 +2,16 @@
* Calculate the center of mass momentum. * Calculate the center of mass momentum.
*/ */
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) { __kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
int index = get_global_id(0); int index = get_global_id(0);
float4 cm = 0.0f; float4 cm = 0.0f;
while (index < numAtoms) { while (index < numAtoms) {
float4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) if (velocity.w != 0) {
cm.xyz += velocity.xyz/velocity.w; cm.x += velocity.x/velocity.w;
cm.y += velocity.y/velocity.w;
cm.z += velocity.z/velocity.w;
}
index += get_global_size(0); index += get_global_size(0);
} }
...@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest ...@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
* Remove center of mass motion. * Remove center of mass motion.
*/ */
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) { __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups. // First sum all of the momenta that were calculated by individual groups.
unsigned int index = get_local_id(0); unsigned int index = get_local_id(0);
...@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* ...@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4*
index = get_global_id(0); index = get_global_id(0);
while (index < numAtoms) { while (index < numAtoms) {
velm[index].xyz -= cm.xyz; velm[index].x -= cm.x;
velm[index].y -= cm.y;
velm[index].z -= cm.z;
index += get_global_size(0); index += get_global_size(0);
} }
} }
/**
 * Load the position of a particle.
 *
 * With USE_MIXED_PRECISION defined, x, y and z are posq[index] plus posqCorrection[index],
 * added in "mixed" arithmetic, and w is copied from posq[index]. Otherwise posq[index] is
 * returned unchanged and posqCorrection is not read.
 */
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifndef USE_MIXED_PRECISION
    return posq[index];
#else
    const real4 stored = posq[index];
    const real4 residual = posqCorrection[index];
    mixed4 full;
    full.x = stored.x+(mixed) residual.x;
    full.y = stored.y+(mixed) residual.y;
    full.z = stored.z+(mixed) residual.z;
    full.w = stored.w;  // the correction's w component is ignored
    return full;
#endif
}
/** /**
* Enforce constraints on SETTLE clusters * Enforce constraints on SETTLE clusters
*/ */
__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) { __kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0); int index = get_global_id(0);
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float2 params = clusterParams[index]; float2 params = clusterParams[index];
float4 apos0 = oldPos[atoms.x]; mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
float4 xp0 = posDelta[atoms.x]; mixed4 xp0 = posDelta[atoms.x];
float4 apos1 = oldPos[atoms.y]; mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
float4 xp1 = posDelta[atoms.y]; mixed4 xp1 = posDelta[atoms.y];
float4 apos2 = oldPos[atoms.z]; mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
float4 xp2 = posDelta[atoms.z]; mixed4 xp2 = posDelta[atoms.z];
float m0 = RECIP(velm[atoms.x].w); mixed m0 = 1/velm[atoms.x].w;
float m1 = RECIP(velm[atoms.y].w); mixed m1 = 1/velm[atoms.y].w;
float m2 = RECIP(velm[atoms.z].w); mixed m2 = 1/velm[atoms.z].w;
// Apply the SETTLE algorithm. // Apply the SETTLE algorithm.
float xb0 = apos1.x-apos0.x; mixed xb0 = apos1.x-apos0.x;
float yb0 = apos1.y-apos0.y; mixed yb0 = apos1.y-apos0.y;
float zb0 = apos1.z-apos0.z; mixed zb0 = apos1.z-apos0.z;
float xc0 = apos2.x-apos0.x; mixed xc0 = apos2.x-apos0.x;
float yc0 = apos2.y-apos0.y; mixed yc0 = apos2.y-apos0.y;
float zc0 = apos2.z-apos0.z; mixed zc0 = apos2.z-apos0.z;
float invTotalMass = 1.0f/(m0+m1+m2); mixed invTotalMass = 1.0f/(m0+m1+m2);
float xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass; mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
float ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass; mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
float zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass; mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
float xa1 = xp0.x - xcom; mixed xa1 = xp0.x - xcom;
float ya1 = xp0.y - ycom; mixed ya1 = xp0.y - ycom;
float za1 = xp0.z - zcom; mixed za1 = xp0.z - zcom;
float xb1 = xb0 + xp1.x - xcom; mixed xb1 = xb0 + xp1.x - xcom;
float yb1 = yb0 + xp1.y - ycom; mixed yb1 = yb0 + xp1.y - ycom;
float zb1 = zb0 + xp1.z - zcom; mixed zb1 = zb0 + xp1.z - zcom;
float xc1 = xc0 + xp2.x - xcom; mixed xc1 = xc0 + xp2.x - xcom;
float yc1 = yc0 + xp2.y - ycom; mixed yc1 = yc0 + xp2.y - ycom;
float zc1 = zc0 + xp2.z - zcom; mixed zc1 = zc0 + xp2.z - zcom;
float xaksZd = yb0*zc0 - zb0*yc0; mixed xaksZd = yb0*zc0 - zb0*yc0;
float yaksZd = zb0*xc0 - xb0*zc0; mixed yaksZd = zb0*xc0 - xb0*zc0;
float zaksZd = xb0*yc0 - yb0*xc0; mixed zaksZd = xb0*yc0 - yb0*xc0;
float xaksXd = ya1*zaksZd - za1*yaksZd; mixed xaksXd = ya1*zaksZd - za1*yaksZd;
float yaksXd = za1*xaksZd - xa1*zaksZd; mixed yaksXd = za1*xaksZd - xa1*zaksZd;
float zaksXd = xa1*yaksZd - ya1*xaksZd; mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd; mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd; mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd; mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
float axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd); mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
float aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd); mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
float azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd); mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
float trns11 = xaksXd / axlng; mixed trns11 = xaksXd / axlng;
float trns21 = yaksXd / axlng; mixed trns21 = yaksXd / axlng;
float trns31 = zaksXd / axlng; mixed trns31 = zaksXd / axlng;
float trns12 = xaksYd / aylng; mixed trns12 = xaksYd / aylng;
float trns22 = yaksYd / aylng; mixed trns22 = yaksYd / aylng;
float trns32 = zaksYd / aylng; mixed trns32 = zaksYd / aylng;
float trns13 = xaksZd / azlng; mixed trns13 = xaksZd / azlng;
float trns23 = yaksZd / azlng; mixed trns23 = yaksZd / azlng;
float trns33 = zaksZd / azlng; mixed trns33 = zaksZd / azlng;
float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0; mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0; mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0; mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0; mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
float za1d = trns13*xa1 + trns23*ya1 + trns33*za1; mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1; mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1; mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1; mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1; mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1; mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1; mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' --- // --- Step2 A2' ---
float rc = 0.5*params.y; float rc = 0.5*params.y;
float rb = sqrt(params.x*params.x-rc*rc); mixed rb = sqrt(params.x*params.x-rc*rc);
float ra = rb*(m1+m2)*invTotalMass; mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra; rb -= ra;
float sinphi = za1d / ra; mixed sinphi = za1d / ra;
float cosphi = sqrt(1.0f - sinphi*sinphi); mixed cosphi = sqrt(1.0f - sinphi*sinphi);
float sinpsi = (zb1d - zc1d) / (2*rc*cosphi); mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
float cospsi = sqrt(1.0f - sinpsi*sinpsi); mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
float ya2d = ra*cosphi; mixed ya2d = ra*cosphi;
float xb2d = - rc*cospsi; mixed xb2d = - rc*cospsi;
float yb2d = - rb*cosphi - rc*sinpsi*sinphi; mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
float yc2d = - rb*cosphi + rc*sinpsi*sinphi; mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
float xb2d2 = xb2d*xb2d; mixed xb2d2 = xb2d*xb2d;
float hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d); mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
float deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y); mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5; xb2d -= deltx*0.5;
// --- Step3 al,be,ga --- // --- Step3 al,be,ga ---
float alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d); mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
float beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d); mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
float gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d; mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
float al2be2 = alpha*alpha + beta*beta; mixed al2be2 = alpha*alpha + beta*beta;
float sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2; mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' --- // --- Step4 A3' ---
float costheta = sqrt(1.0f - sintheta*sintheta); mixed costheta = sqrt(1.0f - sintheta*sintheta);
float xa3d = - ya2d*sintheta; mixed xa3d = - ya2d*sintheta;
float ya3d = ya2d*costheta; mixed ya3d = ya2d*costheta;
float za3d = za1d; mixed za3d = za1d;
float xb3d = xb2d*costheta - yb2d*sintheta; mixed xb3d = xb2d*costheta - yb2d*sintheta;
float yb3d = xb2d*sintheta + yb2d*costheta; mixed yb3d = xb2d*sintheta + yb2d*costheta;
float zb3d = zb1d; mixed zb3d = zb1d;
float xc3d = - xb2d*costheta - yc2d*sintheta; mixed xc3d = - xb2d*costheta - yc2d*sintheta;
float yc3d = - xb2d*sintheta + yc2d*costheta; mixed yc3d = - xb2d*sintheta + yc2d*costheta;
float zc3d = zc1d; mixed zc3d = zc1d;
// --- Step5 A3 --- // --- Step5 A3 ---
float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d; mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d; mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d; mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d; mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d; mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d; mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d; mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d; mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d; mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3; xp0.x = xcom + xa3;
xp0.y = ycom + ya3; xp0.y = ycom + ya3;
...@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res ...@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res
* Enforce velocity constraints on SETTLE clusters * Enforce velocity constraints on SETTLE clusters
*/ */
__kernel void constrainVelocities(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) { __kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) { for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float4 apos0 = oldPos[atoms.x]; mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
float4 apos1 = oldPos[atoms.y]; mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
float4 apos2 = oldPos[atoms.z]; mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
float4 v0 = velm[atoms.x]; mixed4 v0 = velm[atoms.x];
float4 v1 = velm[atoms.y]; mixed4 v1 = velm[atoms.y];
float4 v2 = velm[atoms.z]; mixed4 v2 = velm[atoms.z];
// Compute intermediate quantities: the atom masses, the bond directions, the relative velocities, // Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
// and the angle cosines and sines. // and the angle cosines and sines.
float mA = RECIP(v0.w); mixed mA = 1/v0.w;
float mB = RECIP(v1.w); mixed mB = 1/v1.w;
float mC = RECIP(v2.w); mixed mC = 1/v2.w;
float4 eAB = apos1-apos0; mixed4 eAB = apos1-apos0;
float4 eBC = apos2-apos1; mixed4 eBC = apos2-apos1;
float4 eCA = apos0-apos2; mixed4 eCA = apos0-apos2;
eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z); eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z); eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z); eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
float vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z; mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
float vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z; mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
float vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z; mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
float cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z); mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
float cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z); mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
float cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z); mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
float s2A = 1-cA*cA; mixed s2A = 1-cA*cA;
float s2B = 1-cB*cB; mixed s2B = 1-cB*cB;
float s2C = 1-cC*cC; mixed s2C = 1-cC*cC;
// Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because // Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
// in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're // in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
// making that assumption). We allow all three atoms to have different masses. // making that assumption). We allow all three atoms to have different masses.
float mABCinv = RECIP(mA*mB*mC); mixed mABCinv = 1/(mA*mB*mC);
float denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv; mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
float tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom; mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
float tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom; mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
float tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom; mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w; v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w; v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w; v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment